aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--llvm/test/CodeGen/AArch64/preserve_mostcc.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll1048
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll2564
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll1050
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll3064
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll842
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll7
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll9954
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll363
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll1503
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll295
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll357
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll370
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll534
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll839
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll756
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll1297
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll31
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll74
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll49
-rw-r--r--llvm/test/CodeGen/AMDGPU/call-argument-types.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/div_v2i128.ll123
-rw-r--r--llvm/test/CodeGen/AMDGPU/ds_write2.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll63
-rw-r--r--llvm/test/CodeGen/AMDGPU/finalizebundle.mir65
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptoi.i128.ll266
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll153
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll518
-rw-r--r--llvm/test/CodeGen/AMDGPU/limit-coalesce.mir63
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll21
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll74
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll156
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i1.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-global-i16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-global-i32.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-global-i8.ll15
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-local-i16.ll214
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll31
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll58
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/max.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll1931
-rw-r--r--llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/packed-fp32.ll203
-rw-r--r--llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir5
-rw-r--r--llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll41
-rw-r--r--llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir131
-rw-r--r--llvm/test/CodeGen/AMDGPU/scratch-simple.ll2346
-rw-r--r--llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll15
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-agpr.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/stack-realign.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll466
-rw-r--r--llvm/test/CodeGen/RISCV/mask-variable-shift.ll132
-rw-r--r--llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll4
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir45
-rw-r--r--llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll12
71 files changed, 16088 insertions, 16272 deletions
diff --git a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
index 75c8567..f77ada4 100644
--- a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
+++ b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck -check-prefix CHECK -check-prefix CHECK-DARWIN %s
-; RUN: llc < %s -mtriple=aarch64-unknown-windiws-msvc | FileCheck -check-prefix CHECK -check-prefix CHECK-WIN %s
+; RUN: llc < %s -mtriple=aarch64-unknown-windows-msvc | FileCheck -check-prefix CHECK -check-prefix CHECK-WIN %s
declare void @standard_cc_func()
declare preserve_mostcc void @preserve_mostcc_func()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index c2129c2..6076a2e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -33,7 +33,6 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
; GCN-NEXT: v_mov_b32_e32 v13, s49
; GCN-NEXT: v_mov_b32_e32 v14, s50
; GCN-NEXT: v_mov_b32_e32 v15, s51
-; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
@@ -51,6 +50,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
; GCN-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:56
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:60
; GCN-NEXT: v_mov_b32_e32 v0, s52
+; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64
; GCN-NEXT: v_mov_b32_e32 v0, s53
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index e411c23..7b5621f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -27,11 +27,11 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: v_mov_b32_e32 v0, v0
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
%tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
@@ -68,12 +68,12 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: v_mov_b32_e32 v0, v0
-; GCN-NEXT: v_mov_b32_e32 v1, v1
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
%tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 1cd9c0b..2351c96 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -165,10 +165,10 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v3, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v2, 0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v3, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
@@ -179,15 +179,15 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v3, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 637aaf7..7f10ee4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -546,10 +546,11 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, v0
-; GCN-NEXT: v_mov_b32_e32 v5, v1
-; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
-; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GCN-NEXT: v_mov_b32_e32 v5, v2
+; GCN-NEXT: v_mov_b32_e32 v6, v1
+; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0
+; GCN-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2]
+; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i64:
@@ -742,10 +743,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1]
; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
-; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9]
-; GCN-NEXT: v_mov_b32_e32 v2, v8
-; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2]
-; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2]
+; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v3, v[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v10
+; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v4, v[1:2]
+; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i96:
@@ -758,8 +759,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0
; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9]
; GFX10-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v4, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[8:9]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i96:
@@ -771,8 +772,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9]
; GFX11-NEXT: v_mov_b32_e32 v2, v9
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v6, v4, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[8:9]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i96:
@@ -791,8 +792,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9]
; GFX12-NEXT: v_mov_b32_e32 v2, v8
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2]
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v6, v4, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[8:9]
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_mul_i96:
@@ -808,10 +809,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5
; GFX1250-NEXT: v_mov_b32_e32 v8, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v6, v4, v[8:9]
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v7, v3, v[4:5]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v6, v4, v[8:9]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v7, v3, v[10:11]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
+; GFX1250-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i96 %num, %den
ret i96 %result
@@ -1071,18 +1072,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX7-NEXT: v_mov_b32_e32 v10, v2
+; GFX7-NEXT: v_mov_b32_e32 v11, v3
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v12, v4
+; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0
+; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3]
; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7
-; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
-; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX7-NEXT: v_mul_lo_u32 v6, v9, v6
-; GFX7-NEXT: v_mov_b32_e32 v2, v11
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
-; GFX7-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
-; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
+; GFX7-NEXT: v_mov_b32_e32 v2, v13
+; GFX7-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2]
+; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4]
+; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5]
+; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4]
+; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i128:
@@ -1092,18 +1095,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX8-NEXT: v_mov_b32_e32 v10, v2
+; GFX8-NEXT: v_mov_b32_e32 v11, v3
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v12, v4
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0
+; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3]
; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7
-; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
-; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6
-; GFX8-NEXT: v_mov_b32_e32 v2, v11
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
-; GFX8-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
+; GFX8-NEXT: v_mov_b32_e32 v2, v13
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2]
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4]
+; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5]
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4]
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i128:
@@ -1113,18 +1118,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX9-NEXT: v_mov_b32_e32 v9, v1
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX9-NEXT: v_mov_b32_e32 v10, v2
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v12, v4
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0
+; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3]
; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6
-; GFX9-NEXT: v_mov_b32_e32 v2, v11
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
-; GFX9-NEXT: v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5]
-; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v6, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
+; GFX9-NEXT: v_mov_b32_e32 v2, v13
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2]
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4]
+; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v14, v7, s[4:5]
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4]
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i128:
@@ -1138,11 +1145,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6
; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0
-; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12]
-; GFX10-NEXT: v_mov_b32_e32 v2, v11
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2]
-; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v12, v7, s4
+; GFX10-NEXT: v_mad_u64_u32 v[13:14], s4, v10, v4, v[11:12]
+; GFX10-NEXT: v_mov_b32_e32 v2, v13
+; GFX10-NEXT: v_mad_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[11:12]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v14, v7, s4
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7]
; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6]
@@ -1155,15 +1162,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX11-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v4
; GFX11-NEXT: v_mov_b32_e32 v12, v3
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0
-; GFX11-NEXT: v_mul_lo_u32 v4, v9, v6
-; GFX11-NEXT: v_mul_lo_u32 v6, v8, v7
+; GFX11-NEXT: v_mul_lo_u32 v7, v8, v7
+; GFX11-NEXT: v_mul_lo_u32 v6, v9, v6
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v11, 0
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[2:3]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[1:2]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, s0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo
+; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v10, v11, v[2:3]
+; GFX11-NEXT: v_mov_b32_e32 v2, v13
+; GFX11-NEXT: v_mad_u64_u32 v[3:4], vcc_lo, v8, v5, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[3:4]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v14, v7, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4]
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v12, v11, v[6:7]
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1184,14 +1192,14 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0
-; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v10, v4, v[11:12]
+; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], null, v10, v4, v[11:12]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v2, v11
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
+; GFX12-NEXT: v_mov_b32_e32 v2, v13
+; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[11:12]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v12, v7, s0
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v14, v7, s0
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo
@@ -1210,16 +1218,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1]
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v2, v4, v[10:11]
-; GFX1250-NEXT: v_mov_b32_e32 v12, v1
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13], v2, v4, v[10:11]
+; GFX1250-NEXT: v_mov_b32_e32 v10, v1
; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mov_b32_e32 v13, v10
-; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13]
+; GFX1250-NEXT: v_mov_b32_e32 v11, v12
+; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], vcc_lo, v8, v5, v[10:11]
; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[14:15]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v13, v8, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo
; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1
@@ -2401,207 +2409,216 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX7-NEXT: v_mul_lo_u32 v28, v3, v12
+; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19]
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21]
; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17]
-; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
-; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21]
-; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc
-; GFX7-NEXT: v_mov_b32_e32 v22, v18
-; GFX7-NEXT: v_mov_b32_e32 v18, v19
-; GFX7-NEXT: v_mov_b32_e32 v19, v16
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
-; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21]
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23]
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19]
+; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
+; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23]
+; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19]
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21]
+; GFX7-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc
+; GFX7-NEXT: v_mov_b32_e32 v21, v22
+; GFX7-NEXT: v_mov_b32_e32 v22, v23
+; GFX7-NEXT: v_mov_b32_e32 v23, v18
+; GFX7-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23]
+; GFX7-NEXT: v_mul_lo_u32 v18, v6, v9
; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21]
-; GFX7-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX7-NEXT: v_mul_lo_u32 v27, v3, v12
-; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22]
-; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10
-; GFX7-NEXT: v_mul_lo_u32 v28, v2, v13
-; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12]
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17]
+; GFX7-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25]
+; GFX7-NEXT: v_mov_b32_e32 v20, v23
+; GFX7-NEXT: v_mul_lo_u32 v25, v4, v11
+; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21]
+; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24]
+; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11]
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17]
+; GFX7-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21]
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12]
; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
-; GFX7-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19]
-; GFX7-NEXT: v_mov_b32_e32 v21, v20
-; GFX7-NEXT: v_mov_b32_e32 v20, v11
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
-; GFX7-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
-; GFX7-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11]
+; GFX7-NEXT: v_mov_b32_e32 v12, v22
+; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12]
+; GFX7-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11]
+; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21]
+; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17]
+; GFX7-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11]
; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14
-; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21]
-; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11]
+; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23]
+; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11]
; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15
-; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11]
; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5]
-; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, v10
+; GFX7-NEXT: v_mov_b32_e32 v1, v13
+; GFX7-NEXT: v_mov_b32_e32 v2, v14
+; GFX7-NEXT: v_mov_b32_e32 v7, v11
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX8-NEXT: v_mul_lo_u32 v28, v3, v12
+; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19]
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21]
; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17]
-; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
-; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21]
-; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc
-; GFX8-NEXT: v_mov_b32_e32 v22, v18
-; GFX8-NEXT: v_mov_b32_e32 v18, v19
-; GFX8-NEXT: v_mov_b32_e32 v19, v16
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
-; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21]
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23]
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19]
+; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
+; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23]
+; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19]
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21]
+; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc
+; GFX8-NEXT: v_mov_b32_e32 v21, v22
+; GFX8-NEXT: v_mov_b32_e32 v22, v23
+; GFX8-NEXT: v_mov_b32_e32 v23, v18
+; GFX8-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23]
+; GFX8-NEXT: v_mul_lo_u32 v18, v6, v9
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21]
-; GFX8-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX8-NEXT: v_mul_lo_u32 v27, v3, v12
-; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22]
-; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10
-; GFX8-NEXT: v_mul_lo_u32 v28, v2, v13
-; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12]
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17]
+; GFX8-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25]
+; GFX8-NEXT: v_mov_b32_e32 v20, v23
+; GFX8-NEXT: v_mul_lo_u32 v25, v4, v11
+; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21]
+; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24]
+; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11]
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17]
+; GFX8-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21]
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12]
; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
-; GFX8-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19]
-; GFX8-NEXT: v_mov_b32_e32 v21, v20
-; GFX8-NEXT: v_mov_b32_e32 v20, v11
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
-; GFX8-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
-; GFX8-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11]
+; GFX8-NEXT: v_mov_b32_e32 v12, v22
+; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12]
+; GFX8-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11]
+; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21]
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17]
+; GFX8-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11]
; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14
-; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21]
-; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23]
+; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11]
; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15
-; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11]
; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5]
-; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, v10
+; GFX8-NEXT: v_mov_b32_e32 v1, v13
+; GFX8-NEXT: v_mov_b32_e32 v2, v14
+; GFX8-NEXT: v_mov_b32_e32 v7, v11
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX9-NEXT: v_mul_lo_u32 v28, v3, v12
+; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19]
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21]
; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17]
-; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
-; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21]
-; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v22, vcc
-; GFX9-NEXT: v_mov_b32_e32 v22, v18
-; GFX9-NEXT: v_mov_b32_e32 v18, v19
-; GFX9-NEXT: v_mov_b32_e32 v19, v16
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
-; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21]
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23]
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19]
+; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
+; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23]
+; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19]
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21]
+; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, 0, v24, vcc
+; GFX9-NEXT: v_mov_b32_e32 v21, v22
+; GFX9-NEXT: v_mov_b32_e32 v22, v23
+; GFX9-NEXT: v_mov_b32_e32 v23, v18
+; GFX9-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23]
+; GFX9-NEXT: v_mul_lo_u32 v18, v6, v9
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21]
-; GFX9-NEXT: v_addc_co_u32_e64 v24, s[4:5], 0, v6, s[4:5]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX9-NEXT: v_mul_lo_u32 v27, v3, v12
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22]
-; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10
-; GFX9-NEXT: v_mul_lo_u32 v28, v2, v13
-; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12]
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17]
+; GFX9-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v6, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25]
+; GFX9-NEXT: v_mov_b32_e32 v20, v23
+; GFX9-NEXT: v_mul_lo_u32 v25, v4, v11
+; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21]
+; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24]
+; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11]
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17]
+; GFX9-NEXT: v_addc_co_u32_e64 v24, s[10:11], 0, v23, s[10:11]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21]
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12]
; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
-; GFX9-NEXT: v_addc_co_u32_e64 v22, s[10:11], 0, v6, s[10:11]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19]
-; GFX9-NEXT: v_mov_b32_e32 v21, v20
-; GFX9-NEXT: v_mov_b32_e32 v20, v11
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
-; GFX9-NEXT: v_addc_co_u32_e64 v2, s[10:11], 0, v22, s[10:11]
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[10:11], 0, v2, s[10:11]
+; GFX9-NEXT: v_mov_b32_e32 v12, v22
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12]
+; GFX9-NEXT: v_addc_co_u32_e64 v13, s[10:11], 0, v24, s[10:11]
+; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21]
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17]
+; GFX9-NEXT: v_addc_co_u32_e64 v16, s[10:11], 0, v13, s[10:11]
; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14
-; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21]
-; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v12, v3, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23]
+; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v4, s[10:11]
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15
-; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v24, v4, s[10:11]
-; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v11, v5, s[10:11]
-; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v23, v6, s[10:11]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v17, v0, s[10:11]
+; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v6, v5, s[10:11]
+; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v16, v11, s[10:11]
+; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v26, v12, s[10:11]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v19, v0, s[10:11]
; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v9, s[14:15]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v28, s[12:13]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v27, s[8:9]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5]
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v2, s[12:13]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v28, s[8:9]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v25, s[6:7]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v18, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v10
+; GFX9-NEXT: v_mov_b32_e32 v1, v13
+; GFX9-NEXT: v_mov_b32_e32 v2, v14
+; GFX9-NEXT: v_mov_b32_e32 v7, v11
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i256:
@@ -2609,68 +2626,69 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v16, v0
; GFX10-NEXT: v_mov_b32_e32 v17, v1
-; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9
-; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10
+; GFX10-NEXT: v_mul_lo_u32 v29, v4, v11
+; GFX10-NEXT: v_mul_lo_u32 v31, v3, v12
+; GFX10-NEXT: v_mul_lo_u32 v30, v2, v13
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0
-; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19]
-; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21]
-; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
-; GFX10-NEXT: v_mov_b32_e32 v20, v22
-; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20]
-; GFX10-NEXT: v_mov_b32_e32 v20, v18
+; GFX10-NEXT: v_mul_lo_u32 v28, v17, v14
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v13, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v12, 0
+; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v2, v12, v[18:19]
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[0:1]
+; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[20:21]
+; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19]
+; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v22, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v4, v10, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21]
+; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v22, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v5, v9, v[18:19]
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v10, 0
+; GFX10-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1]
+; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v26, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[24:25], s4, v6, v8, v[20:21]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[18:19]
+; GFX10-NEXT: v_mov_b32_e32 v18, v23
+; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
+; GFX10-NEXT: v_mul_lo_u32 v23, v6, v9
+; GFX10-NEXT: v_mov_b32_e32 v19, v24
+; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10
+; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1]
+; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v27, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[18:19]
; GFX10-NEXT: v_mov_b32_e32 v19, v22
-; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15
-; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20]
+; GFX10-NEXT: v_mul_lo_u32 v27, v16, v15
+; GFX10-NEXT: v_mov_b32_e32 v18, v21
+; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[18:19]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0
-; GFX10-NEXT: v_mul_lo_u32 v20, v4, v11
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25]
-; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s6
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[21:22]
; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15]
-; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
-; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19]
; GFX10-NEXT: v_mov_b32_e32 v13, v1
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12]
-; GFX10-NEXT: v_mov_b32_e32 v14, v21
-; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
-; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19]
-; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14]
-; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8
-; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2]
-; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8
-; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13]
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, s8
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5
-; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s4
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v32, s6
+; GFX10-NEXT: v_mov_b32_e32 v14, v20
+; GFX10-NEXT: v_mad_u64_u32 v[21:22], s7, v3, v10, v[18:19]
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s6, v2, v9, v[11:12]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v1, s6
+; GFX10-NEXT: v_mad_u64_u32 v[10:11], s8, v16, v9, v[13:14]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v4, v9, v[21:22]
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8
+; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v3, v8, v[18:19]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, 0, v15, s8
+; GFX10-NEXT: v_mad_u64_u32 v[14:15], s8, v5, v8, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[10:11]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v4, v12, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v6, v13, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v9, v14, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v15, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v25, v27, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v28, s8
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v30, s6
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v31, s7
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v29, s5
+; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v24, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v23, s4
; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -2681,66 +2699,65 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX11-NEXT: v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7
; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v12, 0
+; GFX11-NEXT: v_mul_lo_u32 v28, v16, v15
; GFX11-NEXT: v_mul_lo_u32 v29, v17, v14
-; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[7:8]
-; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v2, v10, v[7:8]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v3, v9, v[7:8]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v4, v18, v[7:8]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v24, vcc_lo
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v18, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21]
-; GFX11-NEXT: v_mov_b32_e32 v20, v8
-; GFX11-NEXT: v_cndmask_b32_e64 v26, 0, 1, s0
-; GFX11-NEXT: v_mov_b32_e32 v21, v22
-; GFX11-NEXT: v_mul_lo_u32 v22, v6, v9
-; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v2, v18, v[0:1]
+; GFX11-NEXT: v_mul_lo_u32 v32, v3, v12
+; GFX11-NEXT: v_mul_lo_u32 v31, v2, v13
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v17, v13, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v12, 0
+; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v2, v12, v[7:8]
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[20:21]
+; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[7:8]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v10, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo
+; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v5, v9, v[7:8]
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v10, 0
+; GFX11-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v18, v[0:1]
; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[20:21]
-; GFX11-NEXT: v_mov_b32_e32 v6, v25
-; GFX11-NEXT: v_mul_lo_u32 v25, v16, v15
-; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v17, v12, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v16, v11, v[6:7]
+; GFX11-NEXT: v_mad_u64_u32 v[24:25], null, v6, v18, v[20:21]
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[7:8]
+; GFX11-NEXT: v_mov_b32_e32 v7, v23
+; GFX11-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0
+; GFX11-NEXT: v_mul_lo_u32 v23, v6, v9
+; GFX11-NEXT: v_mov_b32_e32 v8, v24
+; GFX11-NEXT: v_mul_lo_u32 v24, v5, v10
+; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v18, v[0:1]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[7:8]
+; GFX11-NEXT: v_dual_mov_b32 v7, v22 :: v_dual_mov_b32 v6, v21
+; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[6:7]
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v18, 0
; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2
-; GFX11-NEXT: v_mad_u64_u32 v[14:15], s1, v2, v11, v[20:21]
-; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[6:7]
-; GFX11-NEXT: v_mul_lo_u32 v20, v2, v13
-; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, s2
-; GFX11-NEXT: v_mov_b32_e32 v11, v1
-; GFX11-NEXT: v_mad_u64_u32 v[13:14], s3, v3, v10, v[14:15]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[6:7]
-; GFX11-NEXT: v_mul_lo_u32 v21, v3, v12
-; GFX11-NEXT: v_mov_b32_e32 v12, v24
-; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, s2
-; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[13:14]
-; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v16, v9, v[11:12]
-; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4
-; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v18, v[1:2]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s4
-; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v18, v[6:7]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[8:9]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v11, v3, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v26, v4, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v10, v5, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v27, v6, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v23, v25, s5
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], s1, v2, v11, v[21:22]
+; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15]
+; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v8, s2
+; GFX11-NEXT: v_mad_u64_u32 v[21:22], s3, v3, v10, v[6:7]
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v2, v9, v[11:12]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v1, s2
+; GFX11-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v9, v[13:14]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v9, v[21:22]
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v18, v[6:7]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v12, s4
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], s4, v5, v18, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[10:11]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v8, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v27, v9, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v12, v6, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v7, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v25, v28, s5
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s4
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v20, s2
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v21, s3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s2
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v32, s3
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s1
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v22, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v23, s0
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -2752,101 +2769,103 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
-; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9
-; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mul_lo_u32 v29, v4, v11
+; GFX12-NEXT: v_mul_lo_u32 v31, v3, v12
+; GFX12-NEXT: v_mul_lo_u32 v30, v2, v13
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
-; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
+; GFX12-NEXT: v_mul_lo_u32 v28, v17, v14
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v17, v13, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v12, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v2, v12, v[18:19]
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1]
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[20:21]
+; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0
-; GFX12-NEXT: v_mov_b32_e32 v20, v22
+; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v4, v10, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v25, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
+; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v5, v9, v[18:19]
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v10, 0
+; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], null, v6, v8, v[20:21]
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT: v_mov_b32_e32 v18, v23
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0
+; GFX12-NEXT: v_mul_lo_u32 v23, v6, v9
+; GFX12-NEXT: v_mov_b32_e32 v19, v24
+; GFX12-NEXT: v_mul_lo_u32 v24, v5, v10
+; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v27, vcc_lo
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[18:19]
; GFX12-NEXT: v_mov_b32_e32 v19, v22
-; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
+; GFX12-NEXT: v_mul_lo_u32 v27, v16, v15
+; GFX12-NEXT: v_mov_b32_e32 v18, v21
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[18:19]
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0
-; GFX12-NEXT: v_mov_b32_e32 v20, v18
-; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25]
-; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11
-; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2
-; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13
+; GFX12-NEXT: v_cndmask_b32_e64 v32, 0, 1, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[21:22]
; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15]
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2
-; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v21
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v32, s2
+; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s3, v3, v10, v[18:19]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12]
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s2, v2, v9, v[11:12]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2
-; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19]
-; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14]
+; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, s2
+; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s4, v16, v9, v[13:14]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v4, v9, v[21:22]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4
-; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2]
+; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v3, v8, v[18:19]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v6, s4
-; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11]
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13]
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v15, s4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s4, v5, v8, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[10:11]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v12, s5
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
+; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v6, v13, s5
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v9, v14, s5
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5
+; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v15, s5
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v23, v22, s5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s4
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s2
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v25, v27, s5
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s3
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v20, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s2
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v31, s3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v29, s1
; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s0
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -2855,87 +2874,89 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
-; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10
-; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mul_lo_u32 v30, v4, v11
+; GFX1250-NEXT: v_mul_lo_u32 v29, v5, v10
+; GFX1250-NEXT: v_mul_lo_u32 v31, v3, v12
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v17, v13, v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
-; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
+; GFX1250-NEXT: v_mul_lo_u32 v32, v2, v13
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v17, v13, v[0:1]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v12, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v2, v12, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v4, v10, v[0:1]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
-; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
-; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9
-; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v5, v9, v[18:19]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v10, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v26, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[24:25], v6, v8, v[20:21]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_dual_mov_b32 v18, v23 :: v_dual_mov_b32 v19, v24
+; GFX1250-NEXT: v_mul_lo_u32 v24, v6, v9
+; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v2, v8, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0
; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18
-; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
+; GFX1250-NEXT: v_mov_b32_e32 v13, v18
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s2, v16, v11, v[20:21]
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1]
+; GFX1250-NEXT: v_cndmask_b32_e64 v11, 0, 1, s2
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
-; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
-; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24
-; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[26:27], s2, v17, v10, v[22:23]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v33, null, 0, v11, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s3, v3, v10, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mov_b32_e32 v12, v1
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s2, v2, v9, v[26:27]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s6, v16, v9, v[12:13]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s4, v4, v9, v[22:23]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v33, s2
; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15
-; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
-; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14
-; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[10:11]
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[20:21]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v28, v11, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23, v2, s2
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v25, v2, s2
; GFX1250-NEXT: v_mov_b32_e32 v2, v15
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20, s4
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s3
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v32, s4
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v31, s3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25, s1
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27, s0
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v30, s1
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v24, vcc_lo
; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1
; GFX1250-NEXT: v_mov_b32_e32 v1, v14
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -2949,60 +2970,60 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
-; GFX7-NEXT: buffer_load_dword v2, v[2:3], s[0:3], 0 addr64
-; GFX7-NEXT: v_mov_b32_e32 v3, 0x50
+; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x50
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v3, 0
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_mul_u64_zext_with_vregs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: flat_load_dword v2, v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v3, 0x50
+; GFX8-NEXT: flat_load_dword v4, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x50
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_zext_with_vregs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dword v2, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x50
+; GFX9-NEXT: global_load_dword v4, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x50
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_zext_with_vregs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: global_load_dword v2, v[2:3], off
+; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_zext_with_vregs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_b32 v2, v[2:3], off
+; GFX11-NEXT: global_load_b32 v4, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_zext_with_vregs:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_b32 v2, v[2:3], off
+; GFX12-NEXT: global_load_b32 v4, v[2:3], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
+; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v4, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
;
; GFX1250-LABEL: s_mul_u64_zext_with_vregs:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
+; GFX1250-NEXT: global_load_b32 v4, v[2:3], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2, 0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v4, 0
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
@@ -3130,33 +3151,36 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX7-NEXT: v_mov_b32_e32 v6, 0x50
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
-; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4]
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0
+; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4
+; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v6, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_mul_u64_sext_with_vregs:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dword v4, v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x50
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
-; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0
+; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_sext_with_vregs:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v4, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x50
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
-; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0
+; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4]
+; GFX9-NEXT: v_mov_b32_e32 v3, v4
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
;
@@ -3183,17 +3207,17 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
;
; GFX12-LABEL: s_mul_u64_sext_with_vregs:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_b32 v2, v[2:3], off
+; GFX12-NEXT: global_load_b32 v4, v[2:3], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
+; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v4, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
;
; GFX1250-LABEL: s_mul_u64_sext_with_vregs:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
+; GFX1250-NEXT: global_load_b32 v4, v[2:3], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2, 0
+; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v4, 0
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 4f2c454..01c601f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -31,128 +31,128 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2
; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1
-; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v2
-; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2
+; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6
; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3
; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3
; CHECK-NEXT: v_trunc_f32_e32 v8, v6
; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8
-; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v3
-; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v8
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0
+; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v3
+; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v8
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0
; CHECK-NEXT: v_mov_b32_e32 v3, v7
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; CHECK-NEXT: v_mul_hi_u32 v8, v9, v6
-; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6
-; CHECK-NEXT: v_mul_lo_u32 v13, v9, v7
-; CHECK-NEXT: v_mul_lo_u32 v14, v12, v7
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8]
+; CHECK-NEXT: v_mul_hi_u32 v7, v11, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6
+; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9
+; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8
-; CHECK-NEXT: v_mul_hi_u32 v8, v9, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v13, v3
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v6
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v3
-; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v3
+; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0
; CHECK-NEXT: v_mov_b32_e32 v3, v7
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v5
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v10
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v10, vcc
-; CHECK-NEXT: v_xor_b32_e32 v8, v3, v10
-; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6
-; CHECK-NEXT: v_mul_lo_u32 v5, v9, v7
-; CHECK-NEXT: v_xor_b32_e32 v11, v4, v10
-; CHECK-NEXT: v_mul_hi_u32 v4, v9, v6
-; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8]
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc
+; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12
+; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6
+; CHECK-NEXT: v_mul_lo_u32 v5, v11, v9
+; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12
+; CHECK-NEXT: v_mul_hi_u32 v4, v11, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v12, v7
+; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v5, v9, v7
+; CHECK-NEXT: v_mul_hi_u32 v5, v11, v9
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v6, v12, v7
+; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc
-; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3
-; CHECK-NEXT: v_mul_lo_u32 v6, v8, v4
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3
-; CHECK-NEXT: v_mul_hi_u32 v9, v11, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc
+; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3
+; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4
+; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4
+; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v6, v8, v4
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v3, v5
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v7, 0
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5
+; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4
+; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v5
-; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v8, v3
-; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, v[4:5]
-; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v4, vcc
-; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v11, v4
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1
-; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5]
+; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6]
+; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc
+; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2
; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5]
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v7
-; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9
+; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v8
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
@@ -220,65 +220,65 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v2, v1
; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0
-; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1
-; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1
-; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0
+; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2]
+; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3]
+; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0
+; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4
+; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0
+; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2]
+; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3]
+; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0
+; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; CHECK-NEXT: v_mul_lo_u32 v2, s13, v0
; CHECK-NEXT: v_mul_lo_u32 v3, s12, v1
; CHECK-NEXT: v_mul_hi_u32 v4, s12, v0
; CHECK-NEXT: v_mul_hi_u32 v0, s13, v0
-; CHECK-NEXT: v_mul_hi_u32 v5, s13, v1
+; CHECK-NEXT: v_mov_b32_e32 v7, s13
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
@@ -291,39 +291,39 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v4, s13, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v2, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v5, s13
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v4, v[1:2]
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s12, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v3, s11
-; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s13, v1
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v1, s11
+; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc
+; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s13, v4
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v2
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s11, v1
-; CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1]
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v2, v4, v5, s[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s10, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s11, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v3
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9]
; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
@@ -382,263 +382,263 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10
; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v10
-; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v4, vcc
+; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v10
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v4, vcc
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5
; GISEL-NEXT: v_trunc_f32_e32 v9, v9
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v5
; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0
; GISEL-NEXT: v_mov_b32_e32 v5, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v9, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v9, v[5:6]
; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11
-; GISEL-NEXT: v_mul_hi_u32 v17, v14, v11
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
+; GISEL-NEXT: v_mul_hi_u32 v13, v16, v11
; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11
-; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12
+; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v9, v14
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v13, v9, v14
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v5
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v11, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v5
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v11, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0
; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; GISEL-NEXT: v_mov_b32_e32 v5, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[5:6]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
-; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9
-; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11
-; GISEL-NEXT: v_mul_lo_u32 v5, v14, v12
-; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9
-; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
+; GISEL-NEXT: v_xor_b32_e32 v17, v0, v9
+; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11
+; GISEL-NEXT: v_mul_lo_u32 v5, v16, v14
+; GISEL-NEXT: v_xor_b32_e32 v18, v1, v9
+; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12
+; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_mul_hi_u32 v5, v14, v12
+; GISEL-NEXT: v_mul_hi_u32 v5, v16, v14
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12
+; GISEL-NEXT: v_mul_hi_u32 v11, v19, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v16, v0
-; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1
-; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
-; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, v18, v0
+; GISEL-NEXT: v_mul_lo_u32 v11, v17, v1
+; GISEL-NEXT: v_mul_hi_u32 v12, v17, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v12, v16, v1
+; GISEL-NEXT: v_mul_lo_u32 v12, v18, v1
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
-; GISEL-NEXT: v_mul_hi_u32 v11, v15, v1
+; GISEL-NEXT: v_mul_hi_u32 v11, v17, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
+; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1
; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v5
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v5
; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v14, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v16, v[1:2]
; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5
; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13]
; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5
; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v0, v[12:13]
; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11
-; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12
-; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13
+; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6
+; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11
+; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14
+; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12
; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc
-; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v4, vcc
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc
+; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v4, vcc
; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1
-; GISEL-NEXT: v_trunc_f32_e32 v16, v11
-; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16
+; GISEL-NEXT: v_trunc_f32_e32 v15, v11
+; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15
; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1
; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7
; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc
; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v10
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v10
-; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc
+; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v10
+; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10
; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v4
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v10
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v4
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13]
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4
-; GISEL-NEXT: v_mul_lo_u32 v10, v18, v12
+; GISEL-NEXT: v_mul_lo_u32 v10, v18, v14
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10
; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
-; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7]
+; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v4
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v4
; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v15, v21, s[4:5]
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10
-; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v15, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v10
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v17, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v15, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10
-; GISEL-NEXT: v_mul_lo_u32 v13, v16, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT: v_mul_lo_u32 v12, v22, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_mul_hi_u32 v12, v18, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10
-; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v16, v11, vcc
+; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v11, vcc
; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v11
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, v[0:1]
-; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[0:1]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v11
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v11, vcc
-; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11
-; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v13, vcc
+; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[0:1]
+; GISEL-NEXT: v_xor_b32_e32 v11, v1, v13
+; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[8:9]
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc
+; GISEL-NEXT: v_xor_b32_e32 v16, v1, v15
+; GISEL-NEXT: v_mul_lo_u32 v1, v14, v10
; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0
-; GISEL-NEXT: v_xor_b32_e32 v16, v2, v11
+; GISEL-NEXT: v_xor_b32_e32 v17, v2, v15
; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10
-; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v13, v10
+; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1
; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1
-; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v13, v0, vcc
-; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1
-; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
-; GISEL-NEXT: v_xor_b32_e32 v10, v14, v8
+; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc
+; GISEL-NEXT: v_mul_lo_u32 v2, v17, v1
+; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0
+; GISEL-NEXT: v_xor_b32_e32 v8, v4, v13
+; GISEL-NEXT: v_mul_hi_u32 v4, v16, v1
+; GISEL-NEXT: v_mul_hi_u32 v1, v17, v1
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0
+; GISEL-NEXT: v_mul_lo_u32 v4, v17, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v3, v16, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v2
-; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v1, v2
+; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v1
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v8
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4]
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v8, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[0:1]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v13
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v10, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v16, v2
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v8, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v8
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6
+; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v6, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v6
+; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v8
; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
-; GISEL-NEXT: v_xor_b32_e32 v4, v11, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc
+; GISEL-NEXT: v_xor_b32_e32 v4, v15, v5
; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4
; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -667,100 +667,100 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v1, v3, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1
-; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2
-; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v2
+; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4
; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3
; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
; CGP-NEXT: v_trunc_f32_e32 v5, v4
; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
-; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3
-; CGP-NEXT: v_cvt_u32_f32_e32 v15, v5
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5]
-; CGP-NEXT: v_mul_hi_u32 v16, v12, v3
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v5, v15, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v15, v3
-; CGP-NEXT: v_mul_lo_u32 v17, v12, v4
-; CGP-NEXT: v_mul_lo_u32 v18, v15, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16
-; CGP-NEXT: v_mul_hi_u32 v16, v12, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3
+; CGP-NEXT: v_cvt_u32_f32_e32 v17, v5
+; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5]
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13]
+; CGP-NEXT: v_mul_lo_u32 v5, v17, v3
+; CGP-NEXT: v_mul_hi_u32 v12, v14, v3
+; CGP-NEXT: v_mul_lo_u32 v13, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v3, v17, v3
+; CGP-NEXT: v_mul_lo_u32 v18, v17, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v14, v4
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v17, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5
; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_mul_hi_u32 v4, v17, v4
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3
-; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5]
-; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v11
-; CGP-NEXT: v_mul_hi_u32 v16, v12, v3
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5]
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v13
-; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v13, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v13
-; CGP-NEXT: v_mul_lo_u32 v5, v15, v3
-; CGP-NEXT: v_mul_lo_u32 v14, v12, v4
-; CGP-NEXT: v_mul_hi_u32 v3, v15, v3
-; CGP-NEXT: v_xor_b32_e32 v10, v10, v13
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v3
+; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc
+; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5]
+; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13]
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v15
+; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v15, vcc
+; CGP-NEXT: v_xor_b32_e32 v12, v5, v15
+; CGP-NEXT: v_mul_lo_u32 v5, v17, v3
+; CGP-NEXT: v_mul_lo_u32 v11, v14, v4
+; CGP-NEXT: v_xor_b32_e32 v13, v10, v15
+; CGP-NEXT: v_mul_hi_u32 v10, v14, v3
+; CGP-NEXT: v_mul_hi_u32 v3, v17, v3
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v16, v15, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5
-; CGP-NEXT: v_mul_hi_u32 v14, v12, v4
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
-; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
+; CGP-NEXT: v_mul_lo_u32 v10, v17, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v11, v14, v4
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_mul_hi_u32 v4, v17, v4
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3
-; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v10, v3
-; CGP-NEXT: v_mul_lo_u32 v12, v11, v4
-; CGP-NEXT: v_mul_hi_u32 v14, v11, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v10, v3
-; CGP-NEXT: v_mul_hi_u32 v15, v10, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v14, v10, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_mul_hi_u32 v12, v11, v4
; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
+; CGP-NEXT: v_addc_u32_e32 v4, vcc, v17, v4, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
+; CGP-NEXT: v_mul_lo_u32 v10, v12, v4
+; CGP-NEXT: v_mul_hi_u32 v11, v12, v3
+; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v11, v13, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v12, v4
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5
+; CGP-NEXT: v_mul_hi_u32 v11, v13, v4
; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v5
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, v[4:5]
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[4:5]
-; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v4, vcc
-; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v5
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v16, v[4:5]
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v12, v3
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[10:11]
+; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v4
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1
; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
@@ -771,13 +771,13 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5]
; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14
-; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc
+; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v16, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v10
; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
@@ -785,8 +785,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v3, v13, v0
-; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; CGP-NEXT: v_xor_b32_e32 v3, v15, v0
+; CGP-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc
; CGP-NEXT: v_xor_b32_e32 v0, v1, v3
; CGP-NEXT: v_xor_b32_e32 v1, v2, v3
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
@@ -840,126 +840,126 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v3, v5, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4
-; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v4
+; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5
; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; CGP-NEXT: v_trunc_f32_e32 v7, v6
; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7
-; CGP-NEXT: v_cvt_u32_f32_e32 v10, v5
-; CGP-NEXT: v_cvt_u32_f32_e32 v13, v7
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7]
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7]
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v5
-; CGP-NEXT: v_mul_hi_u32 v5, v13, v5
-; CGP-NEXT: v_mul_lo_u32 v15, v10, v6
-; CGP-NEXT: v_mul_lo_u32 v16, v13, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5
+; CGP-NEXT: v_cvt_u32_f32_e32 v15, v7
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7]
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11]
+; CGP-NEXT: v_mul_lo_u32 v7, v15, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v12, v5
+; CGP-NEXT: v_mul_lo_u32 v11, v12, v6
+; CGP-NEXT: v_mul_hi_u32 v5, v15, v5
+; CGP-NEXT: v_mul_lo_u32 v16, v15, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v12, v6
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7
; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_mul_hi_u32 v6, v13, v6
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT: v_mul_hi_u32 v6, v15, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v6, vcc
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7]
-; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7]
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v11
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc
-; CGP-NEXT: v_xor_b32_e32 v9, v7, v11
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v5
-; CGP-NEXT: v_mul_lo_u32 v12, v10, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v13, v5
-; CGP-NEXT: v_xor_b32_e32 v8, v8, v11
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v5
+; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v6, vcc
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7]
+; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11]
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v13
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v7, v13
+; CGP-NEXT: v_mul_lo_u32 v7, v15, v5
+; CGP-NEXT: v_mul_lo_u32 v9, v12, v6
+; CGP-NEXT: v_xor_b32_e32 v14, v8, v13
+; CGP-NEXT: v_mul_hi_u32 v8, v12, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v15, v5
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v14, v13, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CGP-NEXT: v_mul_hi_u32 v12, v10, v6
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
-; CGP-NEXT: v_mul_hi_u32 v6, v13, v6
+; CGP-NEXT: v_mul_lo_u32 v8, v15, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; CGP-NEXT: v_mul_hi_u32 v9, v12, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v6, v15, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v8, v5
-; CGP-NEXT: v_mul_lo_u32 v10, v9, v6
-; CGP-NEXT: v_mul_hi_u32 v12, v9, v5
-; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
-; CGP-NEXT: v_mul_hi_u32 v13, v8, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v12, v8, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CGP-NEXT: v_mul_hi_u32 v10, v9, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v15, v6, vcc
+; CGP-NEXT: v_mul_lo_u32 v7, v14, v5
+; CGP-NEXT: v_mul_lo_u32 v8, v11, v6
+; CGP-NEXT: v_mul_hi_u32 v9, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v14, v5
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v9, v14, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_mul_hi_u32 v8, v11, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7
+; CGP-NEXT: v_mul_hi_u32 v9, v14, v6
; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7]
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7]
-; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc
-; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
-; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v7
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v15, v[6:7]
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8]
+; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v14, v9, vcc
+; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v14, v9
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4
; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3
-; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5]
; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v15, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
-; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8
; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v5, v11, v2
-; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v5, v13, v2
+; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc
; CGP-NEXT: v_xor_b32_e32 v2, v3, v5
; CGP-NEXT: v_xor_b32_e32 v3, v4, v5
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
@@ -1049,82 +1049,82 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705
+; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v4, v3
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3
-; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6
+; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6
-; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6
-; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
+; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9
+; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6
+; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9
+; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0
; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1
; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0
-; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb
+; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0
+; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1
+; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
@@ -1133,40 +1133,40 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0
+; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2]
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[1:2]
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
-; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[4:5]
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7
-; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
+; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc
+; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v1, -1, v3, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v7
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4
-; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v3
+; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v4, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = sdiv i64 %num, 1235195
ret i64 %result
@@ -1215,46 +1215,46 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc
; GISEL-NEXT: v_mov_b32_e32 v4, v14
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v7, v[4:5]
; GISEL-NEXT: v_mul_lo_u32 v4, v7, v13
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15]
; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: v_mul_lo_u32 v15, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v14, v18, v16
; GISEL-NEXT: s_subb_u32 s6, 0, 0
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15
-; GISEL-NEXT: v_mul_hi_u32 v15, v16, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
+; GISEL-NEXT: v_mul_hi_u32 v14, v18, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v14, v7, v16
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v18, v16
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GISEL-NEXT: v_xor_b32_e32 v18, v0, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v17
-; GISEL-NEXT: v_mul_hi_u32 v13, v7, v14
+; GISEL-NEXT: v_xor_b32_e32 v17, v0, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v15
+; GISEL-NEXT: v_mul_hi_u32 v13, v7, v16
; GISEL-NEXT: v_xor_b32_e32 v19, v1, v4
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0
-; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1
-; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0
+; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1
+; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0
; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0
; GISEL-NEXT: v_mov_b32_e32 v7, 0x12d8fb
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
@@ -1263,46 +1263,46 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1
+; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v15, 0
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13
+; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v18, 0
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v16, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14]
-; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc
-; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13
+; GISEL-NEXT: v_add_i32_e32 v20, vcc, v15, v13
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v20, v[1:2]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0
+; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14]
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v15, vcc
+; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v15
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v18
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v20, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14]
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16
+; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0
; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v1, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1319,74 +1319,74 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc
-; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
-; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10
-; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5
-; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10
-; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v20, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v18, v14, vcc
+; GISEL-NEXT: v_xor_b32_e32 v1, v12, v4
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9]
+; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
+; GISEL-NEXT: v_xor_b32_e32 v8, v2, v12
+; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
+; GISEL-NEXT: v_xor_b32_e32 v9, v3, v12
+; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_xor_b32_e32 v8, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v9, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v8, v2
+; GISEL-NEXT: v_mul_hi_u32 v6, v8, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_xor_b32_e32 v10, v13, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2
+; GISEL-NEXT: v_mul_lo_u32 v6, v9, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2
+; GISEL-NEXT: v_mul_hi_u32 v5, v8, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v9, 0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3
+; GISEL-NEXT: v_mul_hi_u32 v6, v9, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v11, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v6, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v11, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v13, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v9, v[5:6]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v11, v[5:6]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v9, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v9, v3
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
@@ -1394,8 +1394,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9
-; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11
+; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v13, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
@@ -1406,12 +1406,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i64_oddk_denom:
@@ -1430,112 +1430,112 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
; CGP-NEXT: v_mov_b32_e32 v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10]
; CGP-NEXT: v_mul_hi_u32 v12, v8, v4
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10]
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11]
; CGP-NEXT: v_mul_lo_u32 v10, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v7, v9
-; CGP-NEXT: v_mul_lo_u32 v13, v8, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v7, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v7, v13
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v13
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v13
+; CGP-NEXT: v_mul_hi_u32 v13, v8, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc
; CGP-NEXT: v_mov_b32_e32 v4, v14
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v17, v13
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
-; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
+; CGP-NEXT: v_mul_lo_u32 v4, v19, v13
+; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15]
+; CGP-NEXT: v_mul_lo_u32 v9, v18, v16
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v16, v13
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v18, v13
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_hi_u32 v9, v17, v13
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v15, v16, v14
+; CGP-NEXT: v_mul_hi_u32 v9, v19, v13
+; CGP-NEXT: v_mul_lo_u32 v13, v19, v16
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v14, v18, v16
; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9
; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; CGP-NEXT: v_xor_b32_e32 v18, v0, v9
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v17, v14
-; CGP-NEXT: v_xor_b32_e32 v19, v1, v9
+; CGP-NEXT: v_xor_b32_e32 v15, v0, v9
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v19, v16
+; CGP-NEXT: v_xor_b32_e32 v17, v1, v9
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v19, v0
-; CGP-NEXT: v_mul_lo_u32 v14, v18, v1
-; CGP-NEXT: v_mul_hi_u32 v15, v18, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v19, v0
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v0
+; CGP-NEXT: v_mul_lo_u32 v14, v15, v1
+; CGP-NEXT: v_mul_hi_u32 v16, v15, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v17, v0
; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v19, v1
+; CGP-NEXT: v_mul_lo_u32 v16, v17, v1
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v18, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v15, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v16, v19, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v0, v13
+; CGP-NEXT: v_mul_hi_u32 v18, v17, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v13
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0
-; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc
-; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13
+; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v13
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v18, v[1:2]
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0
+; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v17, v13, vcc
+; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v17, v13
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v15
-; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc
+; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v16
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v18, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
; CGP-NEXT: v_mov_b32_e32 v0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1]
-; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5]
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1]
+; CGP-NEXT: v_cndmask_b32_e64 v20, 0, -1, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14]
+; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v20, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc
+; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v19, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_mul_lo_u32 v5, v7, v0
-; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v14, v17, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
@@ -1553,72 +1553,72 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v17, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v10, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v2, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v7, v5
-; CGP-NEXT: v_xor_b32_e32 v13, v3, v10
-; CGP-NEXT: v_mul_hi_u32 v3, v7, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1
+; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
+; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
+; CGP-NEXT: v_cndmask_b32_e32 v12, v18, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v13, v5, v9
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8]
+; CGP-NEXT: v_xor_b32_e32 v1, v12, v9
+; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v2, v12
+; CGP-NEXT: v_mul_lo_u32 v2, v11, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v10, v5
+; CGP-NEXT: v_xor_b32_e32 v8, v3, v12
+; CGP-NEXT: v_mul_hi_u32 v3, v10, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v11, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v11, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v11, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT: v_mul_hi_u32 v7, v12, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v8, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v12, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v6, v7, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v8, v13, v2
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v8, v2
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v5
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -1626,24 +1626,24 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v9
+; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = sdiv <2 x i64> %num, <i64 1235195, i64 1235195>
ret <2 x i64> %result
@@ -1679,126 +1679,126 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2
; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1
-; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v2
-; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2
+; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5
; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; CHECK-NEXT: v_trunc_f32_e32 v7, v6
; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7
-; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v5
-; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7]
-; CHECK-NEXT: v_mul_hi_u32 v12, v8, v5
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_mul_lo_u32 v7, v11, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
-; CHECK-NEXT: v_mul_lo_u32 v13, v8, v6
-; CHECK-NEXT: v_mul_lo_u32 v14, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; CHECK-NEXT: v_mul_hi_u32 v12, v8, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v14, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5
+; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v7
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7]
+; CHECK-NEXT: v_mul_lo_u32 v6, v14, v5
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8]
+; CHECK-NEXT: v_mul_hi_u32 v7, v11, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5
+; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9
+; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v5
-; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7]
-; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v9, vcc
-; CHECK-NEXT: v_xor_b32_e32 v7, v3, v9
-; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5
-; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6
-; CHECK-NEXT: v_xor_b32_e32 v12, v4, v9
-; CHECK-NEXT: v_mul_hi_u32 v4, v8, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v5
+; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7]
+; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8]
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc
+; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12
+; CHECK-NEXT: v_mul_lo_u32 v3, v14, v5
+; CHECK-NEXT: v_mul_lo_u32 v6, v11, v9
+; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12
+; CHECK-NEXT: v_mul_hi_u32 v4, v11, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3
-; CHECK-NEXT: v_mul_hi_u32 v10, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc
-; CHECK-NEXT: v_mul_lo_u32 v5, v12, v3
-; CHECK-NEXT: v_mul_lo_u32 v6, v7, v4
-; CHECK-NEXT: v_mul_hi_u32 v8, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v12, v3
-; CHECK-NEXT: v_mul_hi_u32 v10, v12, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc
+; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3
+; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4
+; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v8, v12, v4
+; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v6, v7, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v5
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v8, 0
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5
+; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4
+; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v5
-; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, v[4:5]
-; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v4, vcc
-; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1
-; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5]
+; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6]
+; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc
+; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2
; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[4:5]
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v8
-; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9
+; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v7
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
@@ -1850,8 +1850,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8
; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v8
-; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v5, vcc
+; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v8
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v5, vcc
; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11
; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7
; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10
@@ -1859,182 +1859,183 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7
; GISEL-NEXT: v_trunc_f32_e32 v13, v11
; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v13
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v7
+; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0
; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8]
-; GISEL-NEXT: v_mul_lo_u32 v7, v17, v11
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
-; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
-; GISEL-NEXT: v_mul_hi_u32 v13, v14, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8]
+; GISEL-NEXT: v_mul_lo_u32 v7, v19, v11
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
+; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v16, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v17, v12
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v7
-; GISEL-NEXT: v_mul_hi_u32 v18, v14, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_mul_lo_u32 v12, v19, v14
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
+; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18
-; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v19, v14
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v7
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v11, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v7
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0
; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8]
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v15, v0, v7
-; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11
-; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12
-; GISEL-NEXT: v_xor_b32_e32 v16, v1, v7
-; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7
+; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11
+; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14
+; GISEL-NEXT: v_xor_b32_e32 v18, v1, v7
+; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12
+; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v19, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v16, v0
-; GISEL-NEXT: v_mul_lo_u32 v12, v15, v1
-; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0
+; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1
+; GISEL-NEXT: v_mul_hi_u32 v13, v17, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v16, v1
+; GISEL-NEXT: v_mul_lo_u32 v13, v18, v1
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v12, v15, v1
+; GISEL-NEXT: v_mul_hi_u32 v12, v17, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
+; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1
; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v13
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v13
; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v14, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v16, v[1:2]
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6
; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13]
; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6
; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v5, v0, v[12:13]
; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10
-; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v9
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11
-; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12
-; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13
+; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9
+; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11
+; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14
+; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12
; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc
-; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v5, vcc
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc
+; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v5, vcc
; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1
-; GISEL-NEXT: v_trunc_f32_e32 v16, v11
-; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16
+; GISEL-NEXT: v_trunc_f32_e32 v15, v11
+; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15
; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1
; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10
; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc
; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v8
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v8
-; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc
+; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v8
+; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8
; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v5
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v8
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v5
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13]
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5
-; GISEL-NEXT: v_mul_lo_u32 v8, v18, v12
+; GISEL-NEXT: v_mul_lo_u32 v8, v18, v14
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8
; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
-; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7]
+; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v5
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v5
; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v15, v21, s[4:5]
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v8
-; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v15, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v8
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v17, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v21, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT: v_mul_lo_u32 v15, v16, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11
-; GISEL-NEXT: v_mul_hi_u32 v15, v18, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v15, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v17, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v22, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_mul_hi_u32 v13, v18, v14
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15
+; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v16, v12, vcc
+; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v12, vcc
; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, v[0:1]
-; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[0:1]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc
-; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12
-; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v14, v[0:1]
+; GISEL-NEXT: v_xor_b32_e32 v12, v1, v7
+; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc
+; GISEL-NEXT: v_xor_b32_e32 v5, v1, v15
+; GISEL-NEXT: v_mul_lo_u32 v1, v14, v11
; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
-; GISEL-NEXT: v_xor_b32_e32 v16, v2, v12
+; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15
; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11
-; GISEL-NEXT: v_mul_hi_u32 v4, v15, v11
+; GISEL-NEXT: v_mul_hi_u32 v4, v14, v11
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0
+; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1
; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
@@ -2042,25 +2043,25 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v15, v0, vcc
+; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc
; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1
-; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v14, v1
+; GISEL-NEXT: v_mul_lo_u32 v3, v5, v0
+; GISEL-NEXT: v_mul_hi_u32 v4, v5, v1
; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
-; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7
+; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
@@ -2074,39 +2075,38 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1
; GISEL-NEXT: v_mov_b32_e32 v0, v3
; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1]
-; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v7
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v9, v11, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v7, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v7
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v9
+; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v9, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v9
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9
+; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v7, s[4:5]
; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11
; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v5
; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
-; GISEL-NEXT: v_xor_b32_e32 v4, v12, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc
+; GISEL-NEXT: v_xor_b32_e32 v4, v15, v6
; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4
; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -2138,126 +2138,126 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v1, v10, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4
-; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v4
+; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11
; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10
; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10
; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10
; CGP-NEXT: v_trunc_f32_e32 v12, v11
; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12
-; CGP-NEXT: v_cvt_u32_f32_e32 v13, v10
-; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12]
-; CGP-NEXT: v_mul_hi_u32 v17, v13, v10
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; CGP-NEXT: v_mul_lo_u32 v12, v16, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
-; CGP-NEXT: v_mul_lo_u32 v18, v13, v11
-; CGP-NEXT: v_mul_lo_u32 v19, v16, v11
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17
-; CGP-NEXT: v_mul_hi_u32 v17, v13, v11
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v18, v12
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v19, v10
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17
-; CGP-NEXT: v_mul_hi_u32 v11, v16, v11
+; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10
+; CGP-NEXT: v_cvt_u32_f32_e32 v19, v12
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12]
+; CGP-NEXT: v_mul_lo_u32 v11, v19, v10
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
+; CGP-NEXT: v_mul_hi_u32 v12, v16, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v19, v10
+; CGP-NEXT: v_mul_lo_u32 v13, v16, v14
+; CGP-NEXT: v_mul_lo_u32 v15, v19, v14
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v16, v14
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v10
-; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12]
-; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v8, v14
-; CGP-NEXT: v_mul_lo_u32 v8, v16, v10
-; CGP-NEXT: v_mul_lo_u32 v15, v13, v11
-; CGP-NEXT: v_xor_b32_e32 v17, v9, v14
-; CGP-NEXT: v_mul_hi_u32 v9, v13, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_mul_hi_u32 v13, v19, v14
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v10
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12]
+; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc
+; CGP-NEXT: v_xor_b32_e32 v15, v8, v17
+; CGP-NEXT: v_mul_lo_u32 v8, v19, v10
+; CGP-NEXT: v_mul_lo_u32 v11, v16, v14
+; CGP-NEXT: v_xor_b32_e32 v18, v9, v17
+; CGP-NEXT: v_mul_hi_u32 v9, v16, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v19, v10
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v16, v11
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8
-; CGP-NEXT: v_mul_hi_u32 v15, v13, v11
+; CGP-NEXT: v_mul_lo_u32 v9, v19, v14
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8
+; CGP-NEXT: v_mul_hi_u32 v11, v16, v14
; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15
-; CGP-NEXT: v_mul_hi_u32 v11, v16, v11
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_mul_hi_u32 v11, v19, v14
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc
-; CGP-NEXT: v_mul_lo_u32 v10, v17, v8
-; CGP-NEXT: v_mul_lo_u32 v11, v12, v9
-; CGP-NEXT: v_mul_hi_u32 v13, v12, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v17, v8
-; CGP-NEXT: v_mul_hi_u32 v15, v17, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, v19, v9, vcc
+; CGP-NEXT: v_mul_lo_u32 v10, v18, v8
+; CGP-NEXT: v_mul_lo_u32 v11, v15, v9
+; CGP-NEXT: v_mul_hi_u32 v12, v15, v8
+; CGP-NEXT: v_mul_hi_u32 v8, v18, v8
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v9
+; CGP-NEXT: v_mul_lo_u32 v12, v18, v9
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_mul_hi_u32 v11, v12, v9
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v11, v15, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v10
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 0
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v10
+; CGP-NEXT: v_mul_hi_u32 v12, v18, v9
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v14, 0
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v10
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[9:10]
-; CGP-NEXT: v_sub_i32_e32 v8, vcc, v12, v8
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v13, v[9:10]
-; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v17, v9, vcc
-; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v9
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1
-; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v12, v10
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v16, v[9:10]
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, v15, v8
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v14, v[10:11]
+; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v18, v12, vcc
+; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v18, v12
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1
+; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4
; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1
-; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v10, v12, v15, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v13
-; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v1
+; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v9, v11, v12, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v14
+; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v16, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v1
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v1
-; CGP-NEXT: v_cndmask_b32_e32 v1, v16, v4, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v12
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v1
+; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v11
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v12, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v8, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v8, v14, v0
-; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc
+; CGP-NEXT: v_xor_b32_e32 v8, v17, v0
+; CGP-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc
; CGP-NEXT: v_xor_b32_e32 v0, v1, v8
; CGP-NEXT: v_xor_b32_e32 v1, v4, v8
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
@@ -2313,128 +2313,128 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v3, v6, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3
-; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4
-; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4
+; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8
; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6
; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6
; CGP-NEXT: v_trunc_f32_e32 v10, v8
; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10
-; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
+; CGP-NEXT: v_cvt_u32_f32_e32 v13, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0
; CGP-NEXT: v_mov_b32_e32 v6, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7]
-; CGP-NEXT: v_mul_lo_u32 v6, v14, v8
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v14, v8
-; CGP-NEXT: v_mul_lo_u32 v15, v11, v9
-; CGP-NEXT: v_mul_lo_u32 v16, v14, v9
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7]
+; CGP-NEXT: v_mul_lo_u32 v6, v16, v8
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10]
+; CGP-NEXT: v_mul_hi_u32 v9, v13, v8
+; CGP-NEXT: v_mul_hi_u32 v8, v16, v8
+; CGP-NEXT: v_mul_lo_u32 v10, v13, v11
+; CGP-NEXT: v_mul_lo_u32 v12, v16, v11
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v9
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v13, v11
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10
-; CGP-NEXT: v_mul_hi_u32 v9, v14, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT: v_mul_hi_u32 v10, v16, v11
; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6
-; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v6
+; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v8, vcc
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0
; CGP-NEXT: v_mov_b32_e32 v6, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7]
-; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc
-; CGP-NEXT: v_xor_b32_e32 v10, v5, v12
-; CGP-NEXT: v_mul_lo_u32 v5, v14, v8
-; CGP-NEXT: v_mul_lo_u32 v7, v11, v9
-; CGP-NEXT: v_xor_b32_e32 v13, v6, v12
-; CGP-NEXT: v_mul_hi_u32 v6, v11, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v14, v8
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7]
+; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10]
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v14, vcc
+; CGP-NEXT: v_xor_b32_e32 v12, v5, v14
+; CGP-NEXT: v_mul_lo_u32 v5, v16, v8
+; CGP-NEXT: v_mul_lo_u32 v7, v13, v11
+; CGP-NEXT: v_xor_b32_e32 v15, v6, v14
+; CGP-NEXT: v_mul_hi_u32 v6, v13, v8
+; CGP-NEXT: v_mul_hi_u32 v8, v16, v8
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v6, v14, v9
+; CGP-NEXT: v_mul_lo_u32 v6, v16, v11
; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v11, v9
+; CGP-NEXT: v_mul_hi_u32 v7, v13, v11
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT: v_mul_hi_u32 v8, v14, v9
+; CGP-NEXT: v_mul_hi_u32 v8, v16, v11
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v5
-; CGP-NEXT: v_mul_lo_u32 v8, v10, v6
-; CGP-NEXT: v_mul_hi_u32 v9, v10, v5
-; CGP-NEXT: v_mul_hi_u32 v5, v13, v5
-; CGP-NEXT: v_mul_hi_u32 v11, v13, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v16, v6, vcc
+; CGP-NEXT: v_mul_lo_u32 v7, v15, v5
+; CGP-NEXT: v_mul_lo_u32 v8, v12, v6
+; CGP-NEXT: v_mul_hi_u32 v9, v12, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v15, v5
; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v13, v6
+; CGP-NEXT: v_mul_lo_u32 v9, v15, v6
; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT: v_mul_hi_u32 v8, v10, v6
+; CGP-NEXT: v_mul_hi_u32 v8, v12, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v5, v7
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v5, v7
+; CGP-NEXT: v_mul_hi_u32 v9, v15, v6
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, 0
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v8, v[6:7]
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7]
-; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc
-; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
-; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v9, v7
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7]
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v12, v5
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v11, v[7:8]
+; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc
+; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4
; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3
-; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9
-; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v11
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
-; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v5, v12, v2
-; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v5, v14, v2
+; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc
; CGP-NEXT: v_xor_b32_e32 v2, v3, v5
; CGP-NEXT: v_xor_b32_e32 v3, v4, v5
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
@@ -2504,15 +2504,15 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) {
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
-; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
; CGP-NEXT: v_rcp_f32_e32 v1, v1
; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1
; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3
-; CGP-NEXT: v_mul_lo_u32 v1, v1, v4
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
+; CGP-NEXT: v_mul_lo_u32 v5, v1, v4
+; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0
+; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
; CGP-NEXT: v_mul_lo_u32 v0, v1, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v1
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0
@@ -2537,198 +2537,198 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1
-; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9
+; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v10, 0
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
+; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v10
; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
; GISEL-NEXT: v_trunc_f32_e32 v5, v4
; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v3
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
-; GISEL-NEXT: v_mul_lo_u32 v13, v7, v4
-; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8]
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3
+; GISEL-NEXT: v_mul_hi_u32 v7, v9, v3
+; GISEL-NEXT: v_mul_lo_u32 v8, v9, v4
+; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v3
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
-; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0
-; GISEL-NEXT: v_mul_lo_u32 v8, v7, v4
-; GISEL-NEXT: v_mul_hi_u32 v0, v7, v3
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5]
+; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8]
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3
+; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT: v_mul_lo_u32 v7, v9, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v11, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v8, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc
; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3
-; GISEL-NEXT: v_mul_hi_u32 v7, v10, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0
; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
-; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2
+; GISEL-NEXT: v_and_b32_e32 v13, 0xffffff, v2
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0
; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3
; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0
+; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v3
; GISEL-NEXT: v_mov_b32_e32 v5, v8
-; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[5:6]
-; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v8
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[5:6]
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v7
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, v[5:6]
+; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v10
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v12
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[8:9]
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v7
; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2
; GISEL-NEXT: v_trunc_f32_e32 v8, v6
; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3
-; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v2
+; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v3
+; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v8
; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5
; GISEL-NEXT: v_mov_b32_e32 v2, v7
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v15, v[2:3]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v17, v[2:3]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v12, v[7:8]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v2, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v2, v15, v6
-; GISEL-NEXT: v_mul_lo_u32 v10, v12, v7
-; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v5, vcc
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v6
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, v[7:8]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v2, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v2, v17, v6
+; GISEL-NEXT: v_mul_lo_u32 v7, v14, v9
+; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v5, vcc
+; GISEL-NEXT: v_mul_hi_u32 v5, v14, v6
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v15, v7
-; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; GISEL-NEXT: v_mul_hi_u32 v10, v12, v7
+; GISEL-NEXT: v_mul_lo_u32 v5, v17, v9
+; GISEL-NEXT: v_mul_hi_u32 v6, v17, v6
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT: v_mul_hi_u32 v7, v14, v9
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT: v_mul_hi_u32 v7, v15, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v17, v9
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v2
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v2
+; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v17, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v12, 0
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v1
; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[2:3]
-; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v16, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[2:3]
+; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[6:7]
; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v4, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1
-; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v6
-; GISEL-NEXT: v_mul_hi_u32 v14, v10, v5
+; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5
+; GISEL-NEXT: v_mul_lo_u32 v7, v12, v8
+; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc
+; GISEL-NEXT: v_mul_hi_u32 v9, v12, v5
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v6
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v9, v14, v8
+; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v12, v6, vcc
+; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc
; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5
-; GISEL-NEXT: v_mul_lo_u32 v9, v11, v7
-; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v2
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT: v_mul_lo_u32 v9, 0, v7
+; GISEL-NEXT: v_mul_lo_u32 v8, v13, v7
+; GISEL-NEXT: v_mul_hi_u32 v12, v13, v5
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v2
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
-; GISEL-NEXT: v_mul_hi_u32 v14, v11, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v13, v7
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, 0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v5, v6
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v12, 0
+; GISEL-NEXT: v_mul_hi_u32 v14, 0, v7
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
-; GISEL-NEXT: v_mul_hi_u32 v10, 0, v7
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc
; GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v10, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v14, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v11, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v12, v[6:7]
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v5
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
@@ -2736,8 +2736,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v12
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v14, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
@@ -2748,8 +2748,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i64_24bit:
@@ -2769,27 +2769,27 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0
; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0
; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
-; CGP-NEXT: v_mul_lo_u32 v5, v1, v3
-; CGP-NEXT: v_mul_lo_u32 v0, v0, v6
+; CGP-NEXT: v_mul_lo_u32 v5, v0, v6
+; CGP-NEXT: v_mul_lo_u32 v0, v1, v3
; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v1
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v0
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v3
; CGP-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0
-; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v2
-; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v5, v3
-; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v6, v1
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v1, 0
-; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
+; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v2
+; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v8, v3
+; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v1
+; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v6, 0
+; CGP-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v7
-; CGP-NEXT: v_mul_lo_u32 v5, v2, v4
+; CGP-NEXT: v_mul_lo_u32 v6, v2, v4
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc
; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v5
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 1441591..f4489c2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -175,65 +175,65 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_trunc_f32_e32 v2, v1
; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, s11
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0
; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1
; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
@@ -246,36 +246,36 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
+; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v6, s11
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2]
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v6, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s11, v1
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v8, v4, vcc
+; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v4
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v0
-; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4
-; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0
+; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6
+; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v5
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v7
+; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v5
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9
; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
@@ -283,20 +283,20 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v9, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v7, v10, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v0, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1]
; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13]
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v4
-; GFX8-NEXT: v_xor_b32_e32 v1, s1, v3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_xor_b32_e32 v1, s1, v6
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT: v_xor_b32_e32 v3, s2, v5
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3
; GFX8-NEXT: v_xor_b32_e32 v4, s2, v2
; GFX8-NEXT: v_mov_b32_e32 v5, s2
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v3
@@ -312,6 +312,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-LABEL: sdivrem_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s2, s17, 31
; GFX9-NEXT: s_ashr_i32 s4, s19, 31
@@ -335,64 +336,63 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1
+; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
@@ -400,67 +400,67 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v6, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v8, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2
-; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v6, v[2:3]
+; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v4, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1
+; GFX9-NEXT: v_sub_u32_e32 v2, s9, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v0
-; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v1
+; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0
+; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v2, vcc
+; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v8, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v5
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4
+; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9
-; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v8
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v5
+; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v4
; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
-; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v10, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v11, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, v4, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1]
; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
-; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5
-; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3
+; GFX9-NEXT: v_xor_b32_e32 v0, s0, v3
+; GFX9-NEXT: v_xor_b32_e32 v1, s1, v6
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: v_xor_b32_e32 v3, s2, v6
-; GFX9-NEXT: v_xor_b32_e32 v5, s2, v2
-; GFX9-NEXT: v_mov_b32_e32 v6, s2
+; GFX9-NEXT: v_xor_b32_e32 v3, s2, v4
+; GFX9-NEXT: v_xor_b32_e32 v4, s2, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15]
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v5, vcc
+; GFX9-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13]
+; GFX9-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sdivrem_i64:
@@ -1311,68 +1311,68 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_trunc_f32_e32 v2, v1
; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, s11
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX8-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7]
; GFX8-NEXT: s_ashr_i32 s6, s19, 31
; GFX8-NEXT: s_mov_b32 s7, s6
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0
; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1
; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
@@ -1385,38 +1385,38 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
+; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v6, s11
-; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v5, s9
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2]
+; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s10, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NEXT: s_ashr_i32 s10, s3, 31
-; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
+; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v4, vcc
+; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v4
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6
-; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s8, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1]
-; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc
-; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4
-; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s8, v9
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX8-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v0, vcc
+; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], 1, v6
+; GFX8-NEXT: v_addc_u32_e64 v5, s[0:1], 0, v7, s[0:1]
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v4
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
-; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1
-; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
+; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v3
+; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v5, s[0:1]
; GFX8-NEXT: s_add_u32 s0, s18, s6
; GFX8-NEXT: s_addc_u32 s1, s19, s6
; GFX8-NEXT: s_add_u32 s2, s2, s10
@@ -1424,15 +1424,15 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: s_addc_u32 s3, s3, s10
; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11]
; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2
-; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v8
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v4
; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v12, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7]
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
@@ -1441,151 +1441,151 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0
; GFX8-NEXT: s_sub_u32 s5, 0, s2
-; GFX8-NEXT: s_subb_u32 s20, 0, s3
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
+; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0
; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1]
-; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v11
-; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v10, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v15, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s5, v5, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v12, v[1:2]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v16, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1]
-; GFX8-NEXT: v_mul_lo_u32 v8, v12, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v11, v[1:2]
+; GFX8-NEXT: s_subb_u32 s20, 0, s3
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v12, v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v16, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[0:1]
+; GFX8-NEXT: v_mul_lo_u32 v1, v11, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1]
; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0
-; GFX8-NEXT: v_xor_b32_e32 v9, s17, v10
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8
-; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v3, v5, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2
-; GFX8-NEXT: v_mul_hi_u32 v8, v12, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v11, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
-; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8
-; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v12, v0
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; GFX8-NEXT: v_xor_b32_e32 v1, s16, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v2, v11, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT: v_mul_hi_u32 v3, v11, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v12, v0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v1, vcc
+; GFX8-NEXT: v_xor_b32_e32 v1, s16, v6
; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v10, s17
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1]
+; GFX8-NEXT: v_xor_b32_e32 v5, s17, v7
+; GFX8-NEXT: v_mov_b32_e32 v6, s17
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4]
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc
-; GFX8-NEXT: v_xor_b32_e32 v4, s4, v7
-; GFX8-NEXT: v_mul_lo_u32 v7, v5, v2
-; GFX8-NEXT: v_mul_lo_u32 v9, v8, v3
-; GFX8-NEXT: v_mul_hi_u32 v11, v8, v2
-; GFX8-NEXT: v_mul_hi_u32 v2, v5, v2
-; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
-; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v5, v6, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4]
+; GFX8-NEXT: v_mul_lo_u32 v4, v11, v2
+; GFX8-NEXT: v_xor_b32_e32 v3, s4, v9
+; GFX8-NEXT: v_mul_lo_u32 v7, v10, v5
+; GFX8-NEXT: v_mul_hi_u32 v9, v10, v2
+; GFX8-NEXT: v_mul_hi_u32 v2, v11, v2
+; GFX8-NEXT: v_xor_b32_e32 v6, s4, v8
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v11, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7
-; GFX8-NEXT: v_mul_hi_u32 v9, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v9, v11, v5
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v7, v10, v5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v11, v9
-; GFX8-NEXT: v_mul_hi_u32 v3, v5, v3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
-; GFX8-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NEXT: v_mul_lo_u32 v7, s9, v2
-; GFX8-NEXT: v_mul_lo_u32 v8, s8, v3
-; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc
-; GFX8-NEXT: v_mul_hi_u32 v6, s8, v2
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT: v_mul_hi_u32 v5, v11, v5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v11, v4, vcc
+; GFX8-NEXT: v_mul_lo_u32 v9, s9, v2
+; GFX8-NEXT: v_mul_lo_u32 v10, s8, v7
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v3
+; GFX8-NEXT: v_mul_hi_u32 v3, s8, v2
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
+; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v8, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v9, v10
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s9, v3
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v6, s9, v7
; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
-; GFX8-NEXT: v_mul_hi_u32 v8, s8, v3
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3
+; GFX8-NEXT: v_mul_hi_u32 v8, s8, v7
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6
-; GFX8-NEXT: v_mul_hi_u32 v9, s9, v3
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
-; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4]
-; GFX8-NEXT: v_mov_b32_e32 v10, s9
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v2, v3
+; GFX8-NEXT: v_mul_hi_u32 v7, s9, v7
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0
+; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v7, v6
+; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v12, s9
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2
-; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7]
+; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v10, v6, vcc
-; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s9, v6
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1]
+; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v12, v8, vcc
+; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s9, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6
+; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7
-; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v2
-; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6
+; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s2, v2
+; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v7, vcc
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9
; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12
; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1]
-; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8
-; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
-; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1]
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v14
+; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v10
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc
+; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v11, s[0:1]
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v14
; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11
+; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v9
; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v14, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1]
; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11]
-; GFX8-NEXT: v_xor_b32_e32 v2, s0, v6
+; GFX8-NEXT: v_xor_b32_e32 v2, s0, v7
; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8
-; GFX8-NEXT: v_mov_b32_e32 v6, s1
+; GFX8-NEXT: v_mov_b32_e32 v7, s1
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
-; GFX8-NEXT: v_xor_b32_e32 v6, s6, v9
-; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7
-; GFX8-NEXT: v_mov_b32_e32 v8, s6
-; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6
-; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GFX8-NEXT: v_xor_b32_e32 v7, s6, v9
+; GFX8-NEXT: v_xor_b32_e32 v8, s6, v6
+; GFX8-NEXT: v_mov_b32_e32 v9, s6
+; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v7
+; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v8, v9, vcc
; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: v_mov_b32_e32 v9, s13
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
@@ -1622,66 +1622,67 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX9-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7]
; GFX9-NEXT: s_ashr_i32 s6, s19, 31
; GFX9-NEXT: s_mov_b32 s7, s6
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1
+; GFX9-NEXT: v_mul_hi_u32 v5, s11, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
@@ -1693,51 +1694,50 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v6, s11
-; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v4, s9
+; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2]
+; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: s_ashr_i32 s10, s3, 31
-; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v8, v4, vcc
+; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6
-; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1]
-; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8
+; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[0:1]
+; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v0, vcc
+; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], 0, v7, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v11
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v10
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2
-; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v2
+; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v3, s[0:1]
; GFX9-NEXT: s_add_u32 s0, s18, s6
; GFX9-NEXT: s_addc_u32 s1, s19, s6
; GFX9-NEXT: s_add_u32 s2, s2, s10
; GFX9-NEXT: s_mov_b32 s11, s10
; GFX9-NEXT: s_addc_u32 s3, s3, s10
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s3
; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2
-; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v14
; GFX9-NEXT: v_add_f32_e32 v1, v1, v15
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9
+; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v10
; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
@@ -1747,31 +1747,31 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0
; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7]
; GFX9-NEXT: s_sub_u32 s5, 0, s2
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc
-; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v12, vcc
+; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v13, vcc
; GFX9-NEXT: s_subb_u32 s20, 0, s3
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc
-; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v12, v[1:2]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v5, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v17, v[2:3]
+; GFX9-NEXT: v_mul_lo_u32 v2, v12, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v14, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v17, v4
; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v13, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v15, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1
+; GFX9-NEXT: v_mul_lo_u32 v10, v12, v4
+; GFX9-NEXT: v_mul_hi_u32 v0, v12, v0
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, v17, v4
+; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
@@ -1779,119 +1779,119 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_add_u32_e32 v3, v10, v3
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0
-; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v2, vcc
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc
+; GFX9-NEXT: v_xor_b32_e32 v1, s16, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1]
-; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5
-; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v9, s17
-; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5
-; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7
-; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2
-; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1]
+; GFX9-NEXT: v_xor_b32_e32 v8, s17, v5
+; GFX9-NEXT: v_mov_b32_e32 v12, s17
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4]
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v1
+; GFX9-NEXT: v_mul_lo_u32 v3, v11, v2
+; GFX9-NEXT: v_mul_lo_u32 v6, v10, v5
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v12, vcc
; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v8, v11, v5
; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3
-; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3
+; GFX9-NEXT: v_add_u32_e32 v3, v6, v3
+; GFX9-NEXT: v_mul_hi_u32 v6, v10, v5
+; GFX9-NEXT: v_mul_hi_u32 v5, v11, v5
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_add_u32_e32 v7, v8, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
+; GFX9-NEXT: v_add_u32_e32 v6, v8, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v3, v6, v3, v5
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2
-; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3
+; GFX9-NEXT: v_mul_lo_u32 v6, s8, v3
+; GFX9-NEXT: v_xor_b32_e32 v4, s4, v9
; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2
; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2
-; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3
-; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6
+; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT: v_mul_hi_u32 v6, s8, v3
+; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0
-; GFX9-NEXT: v_mov_b32_e32 v8, s4
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v6, v9, v6
+; GFX9-NEXT: v_xor_b32_e32 v7, s4, v7
+; GFX9-NEXT: v_mov_b32_e32 v8, s4
; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc
-; GFX9-NEXT: v_add_u32_e32 v6, v9, v7
-; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4]
-; GFX9-NEXT: v_mov_b32_e32 v9, s9
+; GFX9-NEXT: v_add3_u32 v11, v6, v11, v12
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v8, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4]
+; GFX9-NEXT: v_mov_b32_e32 v12, s9
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7]
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7
-; GFX9-NEXT: v_sub_u32_e32 v6, s9, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
+; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v12, v8, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6
+; GFX9-NEXT: v_sub_u32_e32 v7, s9, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s2, v2
-; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s2, v2
+; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9
; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12
; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 1, v14
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v11, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v14
; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc
-; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11
+; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v9
; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1]
; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11]
-; GFX9-NEXT: v_xor_b32_e32 v2, s0, v6
+; GFX9-NEXT: v_xor_b32_e32 v2, s0, v7
; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8
-; GFX9-NEXT: v_mov_b32_e32 v6, s1
+; GFX9-NEXT: v_mov_b32_e32 v7, s1
; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
-; GFX9-NEXT: v_xor_b32_e32 v6, s6, v9
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX9-NEXT: v_xor_b32_e32 v7, s6, v9
; GFX9-NEXT: v_mov_b32_e32 v13, 0
-; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7
-; GFX9-NEXT: v_mov_b32_e32 v8, s6
-; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v6
-; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v8, vcc
+; GFX9-NEXT: v_xor_b32_e32 v8, s6, v6
+; GFX9-NEXT: v_mov_b32_e32 v9, s6
+; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v7
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v9, vcc
; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13]
; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15]
; GFX9-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 40b5db0..6f42239 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -31,128 +31,128 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1
-; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
-; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0
+; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v6, v3
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6
-; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2
-; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6
-; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v6
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2
+; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7]
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8
+; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
-; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc
-; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9
-; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2
-; CHECK-NEXT: v_mul_lo_u32 v7, v8, v6
-; CHECK-NEXT: v_xor_b32_e32 v10, v4, v9
-; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2
+; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v5
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v11
+; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7]
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v11, vcc
+; CHECK-NEXT: v_xor_b32_e32 v9, v3, v11
+; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v10, v8
+; CHECK-NEXT: v_xor_b32_e32 v12, v4, v11
+; CHECK-NEXT: v_mul_hi_u32 v4, v10, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v4, v13, v8
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2
-; CHECK-NEXT: v_mul_lo_u32 v6, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
-; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc
+; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v10, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_mul_hi_u32 v6, v5, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v2, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v7, 0
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4
+; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4]
-; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v7, v[3:4]
-; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc
-; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4]
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5]
+; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc
+; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
-; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1
+; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5]
; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0
-; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc
+; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5]
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc
; CHECK-NEXT: ; implicit-def: $vgpr2
; CHECK-NEXT: ; implicit-def: $vgpr4
; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -214,65 +214,65 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v2, v1
; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0
-; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1
-; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1
-; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0
+; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2]
+; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3]
+; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0
+; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4
+; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0
+; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2]
+; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3]
+; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0
+; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0
; CHECK-NEXT: v_mul_lo_u32 v3, s10, v1
; CHECK-NEXT: v_mul_hi_u32 v4, s10, v0
; CHECK-NEXT: v_mul_hi_u32 v0, s11, v0
-; CHECK-NEXT: v_mul_hi_u32 v5, s11, v1
+; CHECK-NEXT: v_mov_b32_e32 v7, s11
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
@@ -285,19 +285,19 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v4, s11, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v2, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v5, s11
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[1:2]
+; CHECK-NEXT: v_mov_b32_e32 v1, s9
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v3, s9
-; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s11, v1
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3]
+; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s11, v4
+; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
@@ -372,84 +372,84 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5
; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8
-; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v5
-; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v5
+; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v8, vcc
; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v9
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4
; GISEL-NEXT: v_trunc_f32_e32 v11, v9
; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v11
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v11
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0
; GISEL-NEXT: v_mov_b32_e32 v4, v10
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v15, v9
-; GISEL-NEXT: v_mul_hi_u32 v16, v12, v9
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11]
-; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9
-; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v17, v9
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11]
+; GISEL-NEXT: v_mul_hi_u32 v11, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9
+; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v16
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4
-; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_mul_lo_u32 v11, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11
-; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v4
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v9, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0
; GISEL-NEXT: v_mov_b32_e32 v4, v10
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5]
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4
-; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9
-; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10
-; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4
-; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v0, v17, v9
+; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12
+; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4
+; GISEL-NEXT: v_mul_hi_u32 v1, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10
+; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v17, v12
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v9, v15, v0
; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1
; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
@@ -457,148 +457,148 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v1, v14, v1
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v9
+; GISEL-NEXT: v_mul_hi_u32 v1, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v9
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10
-; GISEL-NEXT: v_xor_b32_e32 v7, v7, v10
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v0
+; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc
+; GISEL-NEXT: v_xor_b32_e32 v6, v6, v9
+; GISEL-NEXT: v_xor_b32_e32 v7, v7, v9
; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
-; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v15
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2]
+; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v7
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0
+; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v16
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v11, v[1:2]
; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12
-; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v11, v[9:10]
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v14, v[9:10]
; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1
-; GISEL-NEXT: v_trunc_f32_e32 v12, v10
+; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v1
+; GISEL-NEXT: v_trunc_f32_e32 v12, v9
; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12
-; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12
-; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
-; GISEL-NEXT: v_mov_b32_e32 v0, v11
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, v[0:1]
-; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1]
-; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v9
-; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10
-; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1
+; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12
+; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, 0
+; GISEL-NEXT: v_sub_i32_e32 v19, vcc, v13, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v10
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v18, v[0:1]
+; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v15, v11, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v14, v[12:13]
+; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v15, v11
+; GISEL-NEXT: v_mul_lo_u32 v11, v18, v9
+; GISEL-NEXT: v_mul_lo_u32 v12, v14, v0
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14
-; GISEL-NEXT: v_mul_hi_u32 v14, v15, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7]
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v13, v5
-; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v14, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v9, v18, v9
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v19, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7]
+; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v19, v5
+; GISEL-NEXT: v_subbrev_u32_e64 v15, s[6:7], 0, v1, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v8
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v15, v8
+; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v12, v5
; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7]
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v1, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v14, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1
-; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18
-; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v1
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v18, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v1
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v18, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v10, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v13, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc
; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v12, v[8:9]
; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5
-; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v11, v2, v5
+; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10
+; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5
+; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v12, v8
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v3, v13, v10
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2
+; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v13, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
-; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2
-; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
-; GISEL-NEXT: v_xor_b32_e32 v10, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2
+; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT: v_xor_b32_e32 v10, v14, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v14, v2
+; GISEL-NEXT: v_mul_lo_u32 v9, v15, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3
-; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2
+; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v9, v14, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3
+; GISEL-NEXT: v_mul_hi_u32 v9, v15, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v12, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v9, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v13, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[8:9]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[8:9]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
@@ -651,128 +651,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v1, v2, v1
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1
-; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v0
-; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v0
+; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CGP-NEXT: v_trunc_f32_e32 v4, v3
; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4]
-; CGP-NEXT: v_mul_hi_u32 v15, v5, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4]
-; CGP-NEXT: v_mul_lo_u32 v4, v14, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v14, v2
-; CGP-NEXT: v_mul_lo_u32 v16, v5, v3
-; CGP-NEXT: v_mul_lo_u32 v17, v14, v3
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15
-; CGP-NEXT: v_mul_hi_u32 v15, v5, v3
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v17, v2
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT: v_mul_hi_u32 v3, v14, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v14, v2
+; CGP-NEXT: v_cvt_u32_f32_e32 v17, v4
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4]
+; CGP-NEXT: v_mul_lo_u32 v3, v17, v2
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5]
+; CGP-NEXT: v_mul_hi_u32 v4, v14, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v17, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v14, v12
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v12
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v2
-; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4]
-; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; CGP-NEXT: v_mul_hi_u32 v15, v5, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4]
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v12
-; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v12, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v4, v12
-; CGP-NEXT: v_mul_lo_u32 v4, v14, v2
-; CGP-NEXT: v_mul_lo_u32 v13, v5, v3
-; CGP-NEXT: v_mul_hi_u32 v2, v14, v2
-; CGP-NEXT: v_xor_b32_e32 v10, v10, v12
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v14, v3
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_mul_hi_u32 v13, v5, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT: v_mul_hi_u32 v3, v14, v3
+; CGP-NEXT: v_mul_hi_u32 v4, v14, v12
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CGP-NEXT: v_mul_hi_u32 v5, v17, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v2
+; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v3, vcc
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4]
+; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v15
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5]
+; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v15, vcc
+; CGP-NEXT: v_xor_b32_e32 v13, v3, v15
+; CGP-NEXT: v_mul_lo_u32 v3, v17, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v14, v12
+; CGP-NEXT: v_xor_b32_e32 v16, v4, v15
+; CGP-NEXT: v_mul_hi_u32 v4, v14, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v17, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc
-; CGP-NEXT: v_mul_lo_u32 v4, v10, v2
-; CGP-NEXT: v_mul_lo_u32 v5, v11, v3
-; CGP-NEXT: v_mul_hi_u32 v13, v11, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v10, v2
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v3
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v4, v17, v12
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_mul_hi_u32 v5, v14, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v17, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v17, v3, vcc
+; CGP-NEXT: v_mul_lo_u32 v4, v16, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
+; CGP-NEXT: v_mul_hi_u32 v10, v13, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v16, v2
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v10, v3
+; CGP-NEXT: v_mul_lo_u32 v10, v16, v3
; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT: v_mul_hi_u32 v5, v11, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v5, v13, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v2, v4
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v13, 0
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4
+; CGP-NEXT: v_mul_hi_u32 v10, v16, v3
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v13, v[3:4]
-; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc
-; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v4
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v10, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v13, v2
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v12, v[4:5]
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v10, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v10
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1
+; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[4:5]
; CGP-NEXT: v_sub_i32_e32 v10, vcc, v2, v0
-; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v3, vcc
+; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v4, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v1
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0
-; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v12
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v12
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v15
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v15
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v15
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v15, vcc
; CGP-NEXT: ; implicit-def: $vgpr4
; CGP-NEXT: ; implicit-def: $vgpr10
; CGP-NEXT: .LBB2_2: ; %Flow1
@@ -820,128 +820,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v3, v4, v3
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3
-; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v2
-; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2
+; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; CGP-NEXT: v_trunc_f32_e32 v6, v5
; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6]
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6]
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v12, v4
-; CGP-NEXT: v_mul_lo_u32 v14, v7, v5
-; CGP-NEXT: v_mul_lo_u32 v15, v12, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v5
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v5, v12, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6]
+; CGP-NEXT: v_mul_lo_u32 v5, v15, v4
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7]
+; CGP-NEXT: v_mul_hi_u32 v6, v12, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v12, v10
+; CGP-NEXT: v_mul_lo_u32 v11, v15, v10
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v4
-; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6]
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v10
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v9, v6, v10
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v4
-; CGP-NEXT: v_mul_lo_u32 v11, v7, v5
-; CGP-NEXT: v_mul_hi_u32 v4, v12, v4
-; CGP-NEXT: v_xor_b32_e32 v8, v8, v10
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v12, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT: v_mul_hi_u32 v5, v12, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v12, v10
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_mul_hi_u32 v7, v15, v10
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v4
+; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v5, vcc
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6]
+; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v13
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7]
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v5, v13
+; CGP-NEXT: v_mul_lo_u32 v5, v15, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v12, v10
+; CGP-NEXT: v_xor_b32_e32 v14, v6, v13
+; CGP-NEXT: v_mul_hi_u32 v6, v12, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc
-; CGP-NEXT: v_mul_lo_u32 v6, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v7, v9, v5
-; CGP-NEXT: v_mul_hi_u32 v11, v9, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v8, v4
-; CGP-NEXT: v_mul_hi_u32 v12, v8, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v6, v15, v10
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v7, v12, v10
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CGP-NEXT: v_mul_hi_u32 v7, v15, v10
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc
+; CGP-NEXT: v_mul_lo_u32 v6, v14, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v8, v11, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v14, v4
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v11, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v8, v14, v5
; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_mul_hi_u32 v7, v9, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v7, v11, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v4, v6
+; CGP-NEXT: v_mul_hi_u32 v8, v14, v5
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6]
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, v[5:6]
-; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v8, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6]
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[6:7]
+; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v14, v8, vcc
+; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v14, v8
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
-; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3
+; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5]
; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2
-; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
+; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
-; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v13
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v13
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; CGP-NEXT: ; implicit-def: $vgpr6
; CGP-NEXT: ; implicit-def: $vgpr8
; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -977,82 +977,82 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000
+; CHECK-NEXT: v_mov_b32_e32 v9, 0xfffff000
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v4, v3
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3
-; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6
+; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6
-; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6
-; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
+; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9
+; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6
+; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9
+; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0
; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1
; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0
-; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000
+; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0
+; CHECK-NEXT: v_mov_b32_e32 v6, 0x1000
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1
+; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
@@ -1060,39 +1060,39 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2]
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2]
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
-; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc
-; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5
+; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc
+; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6
+; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
+; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
-; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5]
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = srem i64 %num, 4096
ret i64 %result
@@ -1141,92 +1141,92 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc
; GISEL-NEXT: v_mov_b32_e32 v4, v14
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15]
; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16
; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13
-; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14
+; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14
-; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7
+; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16
+; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0
-; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1
-; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0
+; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1
+; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0
; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1
+; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1
+; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13
+; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14]
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
-; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14]
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
+; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17
+; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14]
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4
+; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1243,74 +1243,74 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc
-; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
-; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10
-; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5
-; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10
-; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc
+; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9]
+; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
+; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12
+; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
+; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12
+; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2
+; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2
+; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2
+; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3
+; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -1330,10 +1330,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i64_pow2k_denom:
@@ -1352,110 +1352,110 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
; CGP-NEXT: v_mov_b32_e32 v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10]
; CGP-NEXT: v_mul_hi_u32 v12, v8, v4
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10]
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11]
; CGP-NEXT: v_mul_lo_u32 v10, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v7, v9
-; CGP-NEXT: v_mul_lo_u32 v13, v8, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v7, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v7, v13
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v13
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v13
+; CGP-NEXT: v_mul_hi_u32 v13, v8, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc
; CGP-NEXT: v_mov_b32_e32 v4, v14
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v17, v13
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
-; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
+; CGP-NEXT: v_mul_lo_u32 v4, v19, v13
+; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15]
+; CGP-NEXT: v_mul_lo_u32 v9, v18, v16
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v16, v13
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v18, v13
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_hi_u32 v9, v17, v13
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v15, v16, v14
+; CGP-NEXT: v_mul_hi_u32 v9, v19, v13
+; CGP-NEXT: v_mul_lo_u32 v13, v19, v16
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v14, v18, v16
; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9
; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; CGP-NEXT: v_xor_b32_e32 v18, v0, v9
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v17, v14
-; CGP-NEXT: v_xor_b32_e32 v19, v1, v9
+; CGP-NEXT: v_xor_b32_e32 v15, v0, v9
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v19, v16
+; CGP-NEXT: v_xor_b32_e32 v17, v1, v9
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v19, v0
-; CGP-NEXT: v_mul_lo_u32 v14, v18, v1
-; CGP-NEXT: v_mul_hi_u32 v15, v18, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v19, v0
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v0
+; CGP-NEXT: v_mul_lo_u32 v14, v15, v1
+; CGP-NEXT: v_mul_hi_u32 v16, v15, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v17, v0
; CGP-NEXT: v_mov_b32_e32 v4, 0x1000
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v19, v1
+; CGP-NEXT: v_mul_lo_u32 v16, v17, v1
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v18, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v15, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v15, v19, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
-; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
-; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13
+; CGP-NEXT: v_mul_hi_u32 v18, v17, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
+; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0
+; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13
+; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc
; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
-; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
+; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4
+; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4
+; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; CGP-NEXT: v_mov_b32_e32 v0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1]
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
-; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1]
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14]
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4
+; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_mul_lo_u32 v5, v7, v0
-; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
@@ -1473,72 +1473,72 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v10, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v2, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v7, v5
-; CGP-NEXT: v_xor_b32_e32 v13, v3, v10
-; CGP-NEXT: v_mul_hi_u32 v3, v7, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1
+; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
+; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v13, v5, v9
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8]
+; CGP-NEXT: v_xor_b32_e32 v1, v12, v9
+; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v2, v12
+; CGP-NEXT: v_mul_lo_u32 v2, v11, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v10, v5
+; CGP-NEXT: v_xor_b32_e32 v8, v3, v12
+; CGP-NEXT: v_mul_hi_u32 v3, v10, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v11, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v11, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v11, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT: v_mul_hi_u32 v7, v12, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v8, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v12, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v6, v7, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v13, v2
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v8, v2
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -1558,10 +1558,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i64> %num, <i64 4096, i64 4096>
ret <2 x i64> %result
@@ -1573,82 +1573,82 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705
+; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v4, v3
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3
-; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6
+; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6
-; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6
-; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
+; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9
+; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6
+; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9
+; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0
; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1
; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0
-; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb
+; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0
+; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1
+; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
@@ -1656,39 +1656,39 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2]
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2]
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
-; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc
-; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5
+; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc
+; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6
+; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
+; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
-; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5]
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = srem i64 %num, 1235195
ret i64 %result
@@ -1737,92 +1737,92 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc
; GISEL-NEXT: v_mov_b32_e32 v4, v14
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15]
; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16
; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13
-; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14
+; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14
-; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7
+; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16
+; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0
-; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1
-; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0
+; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1
+; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0
; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1
+; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1
+; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13
+; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14]
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
-; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14]
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
+; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17
+; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14]
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4
+; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1839,74 +1839,74 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc
-; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
-; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10
-; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5
-; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10
-; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc
+; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9]
+; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
+; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12
+; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
+; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12
+; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2
+; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2
+; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2
+; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3
+; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -1926,10 +1926,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i64_oddk_denom:
@@ -1948,110 +1948,110 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
; CGP-NEXT: v_mov_b32_e32 v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10]
; CGP-NEXT: v_mul_hi_u32 v12, v8, v4
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10]
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11]
; CGP-NEXT: v_mul_lo_u32 v10, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v7, v9
-; CGP-NEXT: v_mul_lo_u32 v13, v8, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v7, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v7, v13
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v13
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v13
+; CGP-NEXT: v_mul_hi_u32 v13, v8, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc
; CGP-NEXT: v_mov_b32_e32 v4, v14
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v17, v13
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
-; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
+; CGP-NEXT: v_mul_lo_u32 v4, v19, v13
+; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15]
+; CGP-NEXT: v_mul_lo_u32 v9, v18, v16
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v16, v13
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v18, v13
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_hi_u32 v9, v17, v13
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v15, v16, v14
+; CGP-NEXT: v_mul_hi_u32 v9, v19, v13
+; CGP-NEXT: v_mul_lo_u32 v13, v19, v16
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v14, v18, v16
; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9
; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; CGP-NEXT: v_xor_b32_e32 v18, v0, v9
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v17, v14
-; CGP-NEXT: v_xor_b32_e32 v19, v1, v9
+; CGP-NEXT: v_xor_b32_e32 v15, v0, v9
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v19, v16
+; CGP-NEXT: v_xor_b32_e32 v17, v1, v9
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v19, v0
-; CGP-NEXT: v_mul_lo_u32 v14, v18, v1
-; CGP-NEXT: v_mul_hi_u32 v15, v18, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v19, v0
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v0
+; CGP-NEXT: v_mul_lo_u32 v14, v15, v1
+; CGP-NEXT: v_mul_hi_u32 v16, v15, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v17, v0
; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v19, v1
+; CGP-NEXT: v_mul_lo_u32 v16, v17, v1
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v18, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v15, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v15, v19, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
-; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
-; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13
+; CGP-NEXT: v_mul_hi_u32 v18, v17, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
+; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0
+; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13
+; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc
; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
-; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
+; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4
+; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4
+; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; CGP-NEXT: v_mov_b32_e32 v0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1]
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
-; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1]
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14]
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4
+; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_mul_lo_u32 v5, v7, v0
-; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
@@ -2069,72 +2069,72 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v10, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v2, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v7, v5
-; CGP-NEXT: v_xor_b32_e32 v13, v3, v10
-; CGP-NEXT: v_mul_hi_u32 v3, v7, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1
+; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
+; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v13, v5, v9
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8]
+; CGP-NEXT: v_xor_b32_e32 v1, v12, v9
+; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v2, v12
+; CGP-NEXT: v_mul_lo_u32 v2, v11, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v10, v5
+; CGP-NEXT: v_xor_b32_e32 v8, v3, v12
+; CGP-NEXT: v_mul_hi_u32 v3, v10, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v11, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v11, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v11, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT: v_mul_hi_u32 v7, v12, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v8, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v12, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v6, v7, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v13, v2
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v8, v2
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -2154,10 +2154,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i64> %num, <i64 1235195, i64 1235195>
ret <2 x i64> %result
@@ -2193,130 +2193,130 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1
-; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
-; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0
+; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v7, v5
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7
-; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v7
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0
; CHECK-NEXT: v_mov_b32_e32 v2, v6
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3]
-; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
-; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6
-; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3]
+; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5
+; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7]
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8
+; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
-; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v5, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2
+; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v5, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0
; CHECK-NEXT: v_mov_b32_e32 v2, v6
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3]
-; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v9
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v9, vcc
-; CHECK-NEXT: v_xor_b32_e32 v7, v2, v9
-; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5
-; CHECK-NEXT: v_mul_lo_u32 v4, v8, v6
-; CHECK-NEXT: v_xor_b32_e32 v10, v3, v9
-; CHECK-NEXT: v_mul_hi_u32 v3, v8, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3]
+; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v4
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v11
+; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7]
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v11, vcc
+; CHECK-NEXT: v_xor_b32_e32 v9, v2, v11
+; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5
+; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8
+; CHECK-NEXT: v_xor_b32_e32 v12, v3, v11
+; CHECK-NEXT: v_mul_hi_u32 v3, v10, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v3, v11, v6
+; CHECK-NEXT: v_mul_lo_u32 v3, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2
-; CHECK-NEXT: v_mul_hi_u32 v4, v8, v6
+; CHECK-NEXT: v_mul_hi_u32 v4, v10, v8
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_mul_hi_u32 v5, v11, v6
+; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2
-; CHECK-NEXT: v_mul_lo_u32 v5, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v6, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
-; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc
+; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3
+; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_mul_hi_u32 v5, v7, v3
+; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v2, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4
+; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4]
-; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v6, v[3:4]
-; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc
-; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4]
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5]
+; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc
+; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
-; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1
+; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5]
; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0
-; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc
+; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5]
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc
; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6
; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -2361,85 +2361,85 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_xor_b32_e32 v7, v10, v7
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5
; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v5
-; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v7, vcc
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5
+; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc
; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4
; GISEL-NEXT: v_trunc_f32_e32 v12, v10
; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12
-; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v12
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
; GISEL-NEXT: v_mov_b32_e32 v4, v11
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v16, v10
-; GISEL-NEXT: v_mul_hi_u32 v17, v13, v10
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10
-; GISEL-NEXT: v_mul_lo_u32 v12, v13, v11
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v18, v10
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12]
+; GISEL-NEXT: v_mul_hi_u32 v12, v15, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v17
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v17, v16, v11
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4
-; GISEL-NEXT: v_mul_hi_u32 v12, v13, v11
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_mul_lo_u32 v12, v18, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4
+; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12
-; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_mul_hi_u32 v12, v18, v13
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4
-; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4
+; GISEL-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
; GISEL-NEXT: v_mov_b32_e32 v4, v11
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5]
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4
-; GISEL-NEXT: v_mul_lo_u32 v0, v16, v10
-; GISEL-NEXT: v_mul_lo_u32 v14, v13, v11
-; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4
-; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v0, v18, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13
+; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4
+; GISEL-NEXT: v_mul_hi_u32 v1, v15, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT: v_mul_hi_u32 v14, v13, v11
+; GISEL-NEXT: v_mul_lo_u32 v1, v18, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v18, v13
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v0
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v16, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v15, v10
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v0
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v16, v10
; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11
; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6
; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v15, v11
+; GISEL-NEXT: v_mul_lo_u32 v8, v16, v11
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
@@ -2448,127 +2448,127 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6
-; GISEL-NEXT: v_mul_hi_u32 v8, v15, v11
+; GISEL-NEXT: v_mul_hi_u32 v8, v16, v11
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v8, v6
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v8, v6
; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0
; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8
; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8
-; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6
+; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v8
; GISEL-NEXT: v_mov_b32_e32 v0, v10
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, v[0:1]
-; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v16
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v14
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
-; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v10
+; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v6
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v14, v[0:1]
+; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v14, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[10:11]
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v8, vcc
+; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v14
; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1
; GISEL-NEXT: v_trunc_f32_e32 v13, v10
; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1
-; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13
-; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1
+; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13
; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v15, 0
+; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], v16, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v13, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10
-; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12]
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v0
-; GISEL-NEXT: v_mul_lo_u32 v12, v14, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v18, v7
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v19, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v1, v19, v10
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v0
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v20, v7
; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc
-; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v14, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v15, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v20, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v11, s[6:7]
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5
-; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc
+; GISEL-NEXT: v_subbrev_u32_e64 v14, s[6:7], 0, v0, vcc
; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v14, v7
; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v14, v7
; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v21, s[6:7]
; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10
-; GISEL-NEXT: v_mul_lo_u32 v10, v13, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v15, v14, v11
+; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v19, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15
-; GISEL-NEXT: v_mul_hi_u32 v11, v13, v11
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v19, v13
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v0
-; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0
+; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v14, v[1:2]
; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4
; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v11, v[9:10]
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v18, v13, v[9:10]
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v20, v7, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5
-; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9
-; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0
+; GISEL-NEXT: v_mul_lo_u32 v9, v13, v11
+; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5
+; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v13, v9
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9
+; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
+; GISEL-NEXT: v_mul_hi_u32 v9, v13, v11
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10
-; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v14, v11
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0
; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2
; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, v14, v2
+; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3
; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
@@ -2577,19 +2577,19 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v10, v14, v2
+; GISEL-NEXT: v_mul_hi_u32 v10, v15, v2
; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v13, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc
; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10]
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
@@ -2645,103 +2645,103 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v1, v4, v1
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v0
-; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
+; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4
; CGP-NEXT: v_trunc_f32_e32 v12, v10
; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12
-; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; CGP-NEXT: v_cvt_u32_f32_e32 v15, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v18, v12
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
; CGP-NEXT: v_mov_b32_e32 v4, v11
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v16, v10
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; CGP-NEXT: v_mul_hi_u32 v12, v13, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
-; CGP-NEXT: v_mul_lo_u32 v17, v13, v11
-; CGP-NEXT: v_mul_lo_u32 v18, v16, v11
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5]
+; CGP-NEXT: v_mul_lo_u32 v4, v18, v10
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12]
+; CGP-NEXT: v_mul_hi_u32 v11, v15, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v18, v10
+; CGP-NEXT: v_mul_lo_u32 v12, v15, v13
+; CGP-NEXT: v_mul_lo_u32 v14, v18, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT: v_mul_hi_u32 v12, v13, v11
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT: v_mul_hi_u32 v11, v15, v13
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v17, v4
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v18, v10
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12
-; CGP-NEXT: v_mul_hi_u32 v11, v16, v11
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_mul_hi_u32 v12, v18, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4
-; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v4
+; CGP-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
; CGP-NEXT: v_mov_b32_e32 v4, v11
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
-; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v14
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v4, v14
-; CGP-NEXT: v_mul_lo_u32 v4, v16, v10
-; CGP-NEXT: v_mul_lo_u32 v9, v13, v11
-; CGP-NEXT: v_xor_b32_e32 v15, v8, v14
-; CGP-NEXT: v_mul_hi_u32 v8, v13, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5]
+; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v9
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v16
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12]
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v16, vcc
+; CGP-NEXT: v_xor_b32_e32 v14, v4, v16
+; CGP-NEXT: v_mul_lo_u32 v4, v18, v10
+; CGP-NEXT: v_mul_lo_u32 v9, v15, v13
+; CGP-NEXT: v_xor_b32_e32 v17, v8, v16
+; CGP-NEXT: v_mul_hi_u32 v8, v15, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v18, v10
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v8, v16, v11
+; CGP-NEXT: v_mul_lo_u32 v8, v18, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT: v_mul_hi_u32 v9, v13, v11
+; CGP-NEXT: v_mul_hi_u32 v9, v15, v13
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v11
+; CGP-NEXT: v_mul_hi_u32 v10, v18, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v15, v4
-; CGP-NEXT: v_mul_lo_u32 v10, v12, v8
-; CGP-NEXT: v_mul_hi_u32 v11, v12, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v13, v15, v8
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v18, v8, vcc
+; CGP-NEXT: v_mul_lo_u32 v9, v17, v4
+; CGP-NEXT: v_mul_lo_u32 v10, v14, v8
+; CGP-NEXT: v_mul_hi_u32 v11, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v17, v4
; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v11, v15, v8
+; CGP-NEXT: v_mul_lo_u32 v11, v17, v8
; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT: v_mul_hi_u32 v10, v12, v8
+; CGP-NEXT: v_mul_hi_u32 v10, v14, v8
; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v9
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v11, 0
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v4, v9
+; CGP-NEXT: v_mul_hi_u32 v11, v17, v8
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v13, 0
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v4
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4
; CGP-NEXT: v_mov_b32_e32 v4, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v10, v[4:5]
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v11, v[9:10]
-; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v15, v9, vcc
-; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v15, v9
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v11, v[4:5]
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v14, v8
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v13, v[9:10]
+; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v17, v11, vcc
+; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v11
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0
@@ -2754,11 +2754,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v0
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v1
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v1, vcc
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0
-; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
; CGP-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc
@@ -2766,10 +2766,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v14
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v14
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v16
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v16
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc
; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12
; CGP-NEXT: ; implicit-def: $vgpr8
; CGP-NEXT: .LBB8_2: ; %Flow1
@@ -2819,117 +2819,117 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v3, v4, v3
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2
-; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2
+; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4
; CGP-NEXT: v_trunc_f32_e32 v6, v6
; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0
; CGP-NEXT: v_mov_b32_e32 v4, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5]
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5]
; CGP-NEXT: v_mul_lo_u32 v4, v6, v8
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v8
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10]
+; CGP-NEXT: v_mul_hi_u32 v9, v13, v8
; CGP-NEXT: v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT: v_mul_lo_u32 v14, v11, v9
-; CGP-NEXT: v_mul_lo_u32 v15, v6, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v10, v13, v11
+; CGP-NEXT: v_mul_lo_u32 v12, v6, v11
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v9
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v13, v11
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10
-; CGP-NEXT: v_mul_hi_u32 v9, v6, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT: v_mul_hi_u32 v10, v6, v11
; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4
; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0
; CGP-NEXT: v_mov_b32_e32 v4, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5]
-; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc
-; CGP-NEXT: v_xor_b32_e32 v7, v4, v12
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5]
+; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v14
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10]
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v14, vcc
+; CGP-NEXT: v_xor_b32_e32 v10, v4, v14
; CGP-NEXT: v_mul_lo_u32 v4, v6, v8
-; CGP-NEXT: v_mul_lo_u32 v10, v11, v9
-; CGP-NEXT: v_xor_b32_e32 v13, v5, v12
-; CGP-NEXT: v_mul_hi_u32 v5, v11, v8
+; CGP-NEXT: v_mul_lo_u32 v7, v13, v11
+; CGP-NEXT: v_xor_b32_e32 v12, v5, v14
+; CGP-NEXT: v_mul_hi_u32 v5, v13, v8
; CGP-NEXT: v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v6, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v9
+; CGP-NEXT: v_mul_lo_u32 v5, v6, v11
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT: v_mul_hi_u32 v7, v13, v11
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT: v_mul_hi_u32 v9, v6, v9
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_mul_hi_u32 v8, v6, v11
; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
-; CGP-NEXT: v_mul_lo_u32 v6, v13, v4
-; CGP-NEXT: v_mul_lo_u32 v8, v7, v5
-; CGP-NEXT: v_mul_hi_u32 v9, v7, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v13, v4
-; CGP-NEXT: v_mul_hi_u32 v10, v13, v5
+; CGP-NEXT: v_mul_lo_u32 v6, v12, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v8, v10, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v12, v4
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v13, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT: v_mul_hi_u32 v8, v7, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; CGP-NEXT: v_mul_lo_u32 v8, v12, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_mul_hi_u32 v7, v10, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6
+; CGP-NEXT: v_mul_hi_u32 v8, v12, v5
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6]
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6]
-; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6]
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[6:7]
+; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v8, vcc
+; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v8
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
-; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3
+; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5]
; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2
-; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
+; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
@@ -2938,11 +2938,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v14
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v14
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc
; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10
; CGP-NEXT: ; implicit-def: $vgpr5
; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -3004,15 +3004,15 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) {
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
-; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
; CGP-NEXT: v_rcp_f32_e32 v1, v1
; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1
; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3
-; CGP-NEXT: v_mul_lo_u32 v1, v1, v4
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
+; CGP-NEXT: v_mul_lo_u32 v5, v1, v4
+; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0
+; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
; CGP-NEXT: v_mul_lo_u32 v0, v1, v3
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v3
@@ -3035,196 +3035,196 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1
-; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v7
+; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
+; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9
; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
; GISEL-NEXT: v_trunc_f32_e32 v5, v4
; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v3
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v12, v8, v3
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
-; GISEL-NEXT: v_mul_lo_u32 v13, v8, v4
-; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8]
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3
+; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3
+; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4
+; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5]
-; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0
-; GISEL-NEXT: v_mul_lo_u32 v0, v11, v3
-; GISEL-NEXT: v_mul_lo_u32 v5, v8, v4
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5]
+; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0
+; GISEL-NEXT: v_mul_lo_u32 v0, v13, v3
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8]
+; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3
+; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT: v_mul_lo_u32 v5, v10, v4
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v13, v4
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_mul_hi_u32 v5, v8, v4
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v5, v10, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v3, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v13, v3, vcc
; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v5, v11, v7
; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0
; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_mul_lo_u32 v5, 0, v8
+; GISEL-NEXT: v_mul_lo_u32 v5, 0, v7
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v10, v8
+; GISEL-NEXT: v_mul_hi_u32 v6, v11, v7
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v4
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0
-; GISEL-NEXT: v_mul_hi_u32 v6, 0, v8
-; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v7
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v12, 0
+; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7
+; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v9
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1]
-; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v7, v[0:1]
+; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v8
; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0
-; GISEL-NEXT: v_trunc_f32_e32 v11, v7
-; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0
-; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
+; GISEL-NEXT: v_trunc_f32_e32 v9, v7
+; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v0
+; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v9
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v13, 0
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], 0, v12, v[5:6]
; GISEL-NEXT: v_mov_b32_e32 v0, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1]
-; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v12, v[8:9]
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v4
-; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v5
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v8
-; GISEL-NEXT: v_mul_hi_u32 v9, v12, v7
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v16, v[0:1]
+; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v11, v4
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[5:6]
+; GISEL-NEXT: v_mul_lo_u32 v4, v16, v7
+; GISEL-NEXT: v_mul_hi_u32 v8, v13, v7
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v10
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v9, vcc
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v9
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v9, v11, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7
+; GISEL-NEXT: v_mul_lo_u32 v8, v16, v10
+; GISEL-NEXT: v_mul_hi_u32 v7, v16, v7
; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v8
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v5, v13, v10
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7
-; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7
+; GISEL-NEXT: v_mul_hi_u32 v8, v16, v10
; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v4
-; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v5, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v7, 0
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v0, vcc
+; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v4
+; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v16, v5, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, 0
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17
+; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v6, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v8, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v1
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5
-; GISEL-NEXT: v_mul_hi_u32 v14, v7, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v6, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v10, v[0:1]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v1
+; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v9, v[5:6]
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
-; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v14, v8, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
+; GISEL-NEXT: v_cndmask_b32_e32 v14, -1, v5, vcc
+; GISEL-NEXT: v_mul_hi_u32 v5, v9, v4
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v10, v5, vcc
; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v2, v7
-; GISEL-NEXT: v_mul_hi_u32 v13, v2, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v2, v6
+; GISEL-NEXT: v_mul_hi_u32 v9, v2, v4
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v11, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7
+; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v13, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_mul_lo_u32 v7, 0, v6
; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v4, v5
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, 0, v7
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v4, v5
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0
+; GISEL-NEXT: v_mul_hi_u32 v7, 0, v6
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v8, v[5:6]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v8, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v12, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v9, v[5:6]
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v7
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v7, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v3
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -3264,15 +3264,15 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0
; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0
; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
-; CGP-NEXT: v_mul_lo_u32 v0, v0, v6
-; CGP-NEXT: v_mul_lo_u32 v5, v1, v3
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0
+; CGP-NEXT: v_mul_lo_u32 v5, v0, v6
+; CGP-NEXT: v_mul_lo_u32 v7, v1, v3
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v7
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v6, 0
; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
; CGP-NEXT: v_mul_lo_u32 v6, v1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index 1812e17..10e83b7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -189,15 +189,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX10-NEXT: v_mov_b32_e32 v2, s1
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: s_lshr_b32 s1, s1, 24
; GFX10-NEXT: s_lshr_b32 s8, s2, 16
-; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
; GFX10-NEXT: s_lshr_b32 s5, s5, 8
; GFX10-NEXT: v_mov_b32_e32 v5, s0
; GFX10-NEXT: s_lshr_b32 s0, s7, 8
; GFX10-NEXT: v_mov_b32_e32 v6, s6
-; GFX10-NEXT: v_mov_b32_e32 v7, s1
-; GFX10-NEXT: s_lshr_b32 s1, s9, 8
; GFX10-NEXT: v_mov_b32_e32 v8, s5
; GFX10-NEXT: v_mov_b32_e32 v9, s0
; GFX10-NEXT: ds_write_b8 v1, v0
@@ -208,18 +204,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX10-NEXT: ds_write_b8 v1, v8 offset:1
; GFX10-NEXT: ds_write_b8 v1, v9 offset:5
; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: v_mov_b32_e32 v3, s2
-; GFX10-NEXT: v_mov_b32_e32 v10, s1
+; GFX10-NEXT: s_lshr_b32 s1, s1, 24
+; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
; GFX10-NEXT: s_lshr_b32 s0, s2, 24
-; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
-; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
-; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
+; GFX10-NEXT: v_mov_b32_e32 v7, s1
+; GFX10-NEXT: s_lshr_b32 s1, s9, 8
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: ds_write_b8 v1, v0 offset:10
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_and_b32 s0, 0xffff, s3
-; GFX10-NEXT: s_lshr_b32 s1, s3, 16
+; GFX10-NEXT: v_mov_b32_e32 v10, s1
; GFX10-NEXT: s_lshr_b32 s0, s0, 8
+; GFX10-NEXT: s_lshr_b32 s1, s3, 16
; GFX10-NEXT: v_mov_b32_e32 v2, s3
+; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
+; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
+; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
; GFX10-NEXT: v_mov_b32_e32 v3, s0
; GFX10-NEXT: s_lshr_b32 s0, s3, 24
; GFX10-NEXT: v_mov_b32_e32 v4, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 9e412b6..23ef596 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -132,65 +132,64 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_trunc_f32_e32 v2, v1
; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0
; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s9, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
@@ -203,54 +202,55 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
+; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2]
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s8, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2]
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s9
-; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v2, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v0
+; GFX8-NEXT: v_subb_u32_e64 v5, s[0:1], v2, v4, vcc
+; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v4
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v5
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v7
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s10, v6
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s10, v3
; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc
-; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4
-; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
+; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6
+; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v4
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2
+; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s10, v4
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9
-; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
+; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc
; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v14, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v15, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v13, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v9, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v14, vcc
; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
@@ -271,63 +271,64 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
-; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
+; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s17, v1
+; GFX9-NEXT: v_mul_hi_u32 v5, s17, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
@@ -339,53 +340,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v5, 0
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v6, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v3, v[1:2]
-; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s16, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v5, v[1:2]
+; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s18, v7, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v1, s19
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s19, v6, v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, s17
-; GFX9-NEXT: v_mov_b32_e32 v4, s19
-; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v2, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v8
-; GFX9-NEXT: v_sub_u32_e32 v0, s17, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v7
+; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s16, v0
+; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v2, v4, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v5
+; GFX9-NEXT: v_sub_u32_e32 v0, s17, v4
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v8
-; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s18, v7
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v5
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v3
; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v7, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v9
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v4
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v9
-; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v2
+; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s18, v4
; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
-; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
+; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v0, vcc
; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v13, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v10, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v15, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[12:13]
-; GFX9-NEXT: global_store_dwordx2 v6, v[2:3], s[14:15]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v14, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v15, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v16, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v8, v[0:1], s[12:13]
+; GFX9-NEXT: global_store_dwordx2 v8, v[2:3], s[14:15]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: udivrem_i64:
@@ -1005,72 +1005,72 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT: v_mov_b32_e32 v10, s13
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX8-NEXT: v_trunc_f32_e32 v2, v1
; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
; GFX8-NEXT: s_sub_u32 s2, 0, s14
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX8-NEXT: s_subb_u32 s3, 0, s15
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0
; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX8-NEXT: v_mov_b32_e32 v5, s13
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
@@ -1083,136 +1083,136 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v7, 0
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v8, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, v[1:2]
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v7, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v3, s9
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v4, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[1:2]
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s8, v0
-; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v2, vcc
-; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v2
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s13, v8, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, s9
+; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v2, v4, vcc
+; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v4
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1]
; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s15
; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s14
-; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v5, vcc
+; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v10, vcc
; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3
; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v1
-; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v6, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v13, vcc, s12, v1
+; GFX8-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v12, vcc
; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; GFX8-NEXT: v_trunc_f32_e32 v4, v3
; GFX8-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4
; GFX8-NEXT: v_add_f32_e32 v2, v3, v2
-; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v2
-; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v7
-; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v8, s[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v4
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11
-; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10
-; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11
-; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1]
-; GFX8-NEXT: v_mul_lo_u32 v4, v15, v2
-; GFX8-NEXT: v_mul_lo_u32 v17, v12, v3
-; GFX8-NEXT: v_mul_hi_u32 v6, v12, v2
-; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17
-; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v2
+; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v8
+; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v9, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v18, v4
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v14
+; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4]
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v13
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5]
+; GFX8-NEXT: v_mul_lo_u32 v4, v18, v2
+; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v12, v10, vcc
+; GFX8-NEXT: v_mul_lo_u32 v5, v15, v6
+; GFX8-NEXT: v_mul_hi_u32 v10, v15, v2
+; GFX8-NEXT: v_mul_hi_u32 v2, v18, v2
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v14
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v15, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v17, v4
-; GFX8-NEXT: v_mul_hi_u32 v17, v12, v3
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17
-; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13
-; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc
-; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10
-; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3
-; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v5, vcc
+; GFX8-NEXT: v_mul_lo_u32 v10, v18, v6
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
+; GFX8-NEXT: v_mul_hi_u32 v5, v15, v6
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v10, v5
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v16
+; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v17, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v13
+; GFX8-NEXT: v_mul_hi_u32 v6, v18, v6
+; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v7, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v2
-; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0
-; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v3, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v2
+; GFX8-NEXT: v_addc_u32_e32 v18, vcc, v18, v4, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v10, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, v5
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6]
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1]
-; GFX8-NEXT: v_mul_lo_u32 v7, v15, v4
-; GFX8-NEXT: v_mul_lo_u32 v8, v12, v5
-; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3]
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v3, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[2:3], s3, v15, v[5:6]
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v12, vcc
+; GFX8-NEXT: v_mul_lo_u32 v6, v18, v4
+; GFX8-NEXT: v_mul_lo_u32 v8, v15, v7
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v10, s[0:1]
+; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v19, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v9, v15, v5
-; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7
-; GFX8-NEXT: v_mul_hi_u32 v8, v12, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v9, v18, v7
+; GFX8-NEXT: v_mul_hi_u32 v4, v18, v4
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
+; GFX8-NEXT: v_mul_hi_u32 v8, v15, v7
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8
-; GFX8-NEXT: v_mul_hi_u32 v5, v15, v5
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4
-; GFX8-NEXT: v_mul_lo_u32 v8, s10, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1]
+; GFX8-NEXT: v_mul_hi_u32 v7, v18, v7
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v15, v4
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v18, v6, vcc
+; GFX8-NEXT: v_mul_lo_u32 v8, s11, v4
+; GFX8-NEXT: v_mul_lo_u32 v9, s10, v7
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1]
; GFX8-NEXT: v_mul_hi_u32 v1, s10, v4
; GFX8-NEXT: v_mul_hi_u32 v4, s11, v4
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v9
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s11, v5
+; GFX8-NEXT: v_mul_lo_u32 v5, s11, v7
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, s10, v5
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX8-NEXT: v_mul_hi_u32 v8, s10, v7
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v5, v8
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, s11, v5
+; GFX8-NEXT: v_mul_hi_u32 v7, s11, v7
; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, v8, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v7, v1
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[1:2]
; GFX8-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1]
@@ -1279,60 +1279,61 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
-; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
+; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, s5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
; GFX9-NEXT: s_sub_u32 s2, 0, s6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX9-NEXT: s_subb_u32 s3, 0, s7
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0
@@ -1349,114 +1350,113 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v7, 0
+; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v9, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v3, s17
+; GFX9-NEXT: v_add3_u32 v10, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v10, v[1:2]
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0
-; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v3, v2, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v9, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, s17
+; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v2, v4, vcc
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1
+; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1]
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7
; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6
-; GFX9-NEXT: v_sub_u32_e32 v2, s17, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v5, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v8, vcc
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3
; GFX9-NEXT: v_add_f32_e32 v2, v2, v4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s4, v1
-; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v6, vcc
+; GFX9-NEXT: v_subrev_co_u32_e32 v13, vcc, s4, v1
+; GFX9-NEXT: v_subbrev_co_u32_e64 v14, s[0:1], 0, v12, vcc
; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; GFX9-NEXT: v_trunc_f32_e32 v4, v3
; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4
; GFX9-NEXT: v_add_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v2
-; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v7
-; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11
-; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v5, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v11
-; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1]
-; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2
-; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3
-; GFX9-NEXT: v_mul_hi_u32 v5, v12, v2
-; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17
-; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v2
+; GFX9-NEXT: v_add_co_u32_e64 v16, s[0:1], 1, v9
+; GFX9-NEXT: v_addc_co_u32_e64 v17, s[0:1], 0, v10, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v4
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v14
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v13
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5]
+; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v12, v8, vcc
+; GFX9-NEXT: v_mul_lo_u32 v5, v15, v6
+; GFX9-NEXT: v_mul_hi_u32 v8, v15, v2
+; GFX9-NEXT: v_mul_hi_u32 v2, v18, v2
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v14
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v5, v15, v3
-; GFX9-NEXT: v_add_u32_e32 v4, v17, v4
-; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3
-; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2
+; GFX9-NEXT: v_mul_lo_u32 v8, v18, v6
+; GFX9-NEXT: v_add_u32_e32 v4, v5, v4
+; GFX9-NEXT: v_mul_hi_u32 v5, v15, v6
+; GFX9-NEXT: v_mul_hi_u32 v6, v18, v6
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17
-; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX9-NEXT: v_add_u32_e32 v5, v5, v17
-; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13
-; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc
+; GFX9-NEXT: v_add_u32_e32 v5, v8, v5
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 1, v16
+; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v17, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2
-; GFX9-NEXT: v_add3_u32 v3, v5, v4, v3
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0
-; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc
+; GFX9-NEXT: v_add3_u32 v4, v5, v4, v6
+; GFX9-NEXT: v_add_co_u32_e32 v15, vcc, v15, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1]
+; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, v18, v4, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0
+; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v13
; GFX9-NEXT: v_mov_b32_e32 v2, v5
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3]
-; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10
-; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v6, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s3, v12, v[2:3]
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1]
-; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4
-; GFX9-NEXT: v_mul_lo_u32 v7, v12, v5
-; GFX9-NEXT: v_mul_hi_u32 v9, v12, v4
-; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v18, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3]
-; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v9
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3]
+; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v7, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v15, v[5:6]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11
+; GFX9-NEXT: v_mul_lo_u32 v5, v18, v4
+; GFX9-NEXT: v_mul_lo_u32 v6, v15, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[0:1]
+; GFX9-NEXT: v_mul_hi_u32 v9, v15, v4
+; GFX9-NEXT: v_mul_hi_u32 v4, v18, v4
+; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v6
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
-; GFX9-NEXT: v_mul_lo_u32 v9, v15, v5
-; GFX9-NEXT: v_add_u32_e32 v6, v7, v6
-; GFX9-NEXT: v_mul_hi_u32 v7, v12, v5
-; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5
+; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3]
+; GFX9-NEXT: v_mul_lo_u32 v9, v18, v7
+; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT: v_mul_hi_u32 v6, v15, v7
+; GFX9-NEXT: v_mul_hi_u32 v7, v18, v7
; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v9, v4
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[2:3]
-; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3]
; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v6
-; GFX9-NEXT: v_add_u32_e32 v7, v9, v7
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
-; GFX9-NEXT: v_add3_u32 v5, v7, v6, v5
-; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v12, v4
-; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v15, v5, s[2:3]
+; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v5
+; GFX9-NEXT: v_add_u32_e32 v6, v9, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3]
+; GFX9-NEXT: v_add3_u32 v5, v6, v5, v7
+; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v15, v4
+; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v18, v5, s[2:3]
; GFX9-NEXT: v_mul_lo_u32 v6, s19, v4
; GFX9-NEXT: v_mul_lo_u32 v7, s18, v5
; GFX9-NEXT: v_mul_hi_u32 v9, s18, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v14, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v19, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v13, v19, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index b33b8a7..4a22a91 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -272,10 +272,6 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
-; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64
; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80
; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96
@@ -288,6 +284,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208
; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224
; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
+; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB6_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 74552a5..746ffcf 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -3105,22 +3105,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-LABEL: bitcast_v32i32_to_v128i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
@@ -3253,6 +3237,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr41
@@ -3284,14 +3284,13 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB12_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -3523,7 +3522,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: s_cbranch_execz .LBB12_4
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -3946,8 +3944,24 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v13
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
@@ -4295,44 +4309,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v32i32_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -4437,6 +4419,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -4542,129 +4540,129 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; VI-NEXT: v_mov_b32_e32 v55, v39
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
-; VI-NEXT: v_mov_b32_e32 v55, v39
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
-; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27
; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10
; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9
; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9
@@ -5113,9 +5111,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -5255,15 +5254,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55
; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -5280,12 +5272,23 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v32i32_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -5302,9 +5305,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
@@ -5437,7 +5437,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
@@ -5493,7 +5492,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(29)
+; GFX9-NEXT: s_waitcnt vmcnt(45)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
@@ -5508,7 +5507,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(31)
+; GFX9-NEXT: s_waitcnt vmcnt(47)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -5520,149 +5519,147 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16
; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15
@@ -5670,7 +5667,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12
; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8
@@ -5698,7 +5697,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: s_cbranch_execz .LBB12_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_u32_e32 v32, 3, v32
-; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: s_waitcnt vmcnt(44)
; GFX9-NEXT: v_add_u32_e32 v31, 3, v31
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; GFX9-NEXT: v_add_u32_e32 v30, 3, v30
@@ -6006,9 +6005,25 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
@@ -6280,22 +6295,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -6755,7 +6754,11 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
@@ -6776,10 +6779,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -7416,7 +7415,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20
@@ -10666,7 +10665,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v16, s32
; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4
; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8
@@ -11599,7 +11598,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: v_readlane_b32 s35, v16, 3
; GFX11-NEXT: v_readlane_b32 s34, v16, 2
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v16, off, s32
; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8
@@ -11812,13 +11811,26 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196
@@ -11979,44 +11991,30 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380
; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB14_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
@@ -12025,11 +12023,11 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
@@ -12632,7 +12630,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB14_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
@@ -12646,8 +12643,8 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
@@ -13327,13 +13324,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -13470,34 +13479,20 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -13983,7 +13978,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_add_u16_e32 v9, 3, v61
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
@@ -14561,13 +14555,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -14709,34 +14717,20 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -15223,7 +15217,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_add_u16_e32 v9, 3, v61
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
@@ -16362,7 +16355,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584
@@ -16395,7 +16388,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468
-; GFX11-FAKE16-NEXT: s_clause 0x12
+; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456
@@ -17336,7 +17329,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36
; GFX11-FAKE16-NEXT: .LBB14_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392
; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396
; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400
@@ -17369,7 +17362,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508
; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0x12
+; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524
; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528
@@ -18086,24 +18079,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_branch .LBB15_3
; SI-NEXT: .LBB15_2:
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v55, v56
; SI-NEXT: v_mov_b32_e32 v42, v46
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
@@ -18114,10 +18096,22 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: .LBB15_3: ; %Flow
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_mov_b32_e32 v35, v57
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
@@ -18127,7 +18121,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: ; %bb.4: ; %cmp.true
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -18722,13 +18715,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5
; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7
; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9
; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
@@ -18956,11 +18949,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_and_b32 s4, s28, 0xff
; VI-NEXT: s_lshl_b32 s5, s29, 8
; VI-NEXT: s_or_b32 s4, s4, s5
@@ -18970,11 +18963,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_lshl_b32 s7, s23, 8
; VI-NEXT: s_lshl_b32 s8, s27, 8
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
@@ -18982,6 +18972,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -19190,12 +19182,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_branch .LBB15_3
; VI-NEXT: .LBB15_2:
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v44, v56
; VI-NEXT: v_mov_b32_e32 v41, v33
; VI-NEXT: v_mov_b32_e32 v50, v40
@@ -19213,6 +19199,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v54, v53
; VI-NEXT: v_mov_b32_e32 v52, v36
; VI-NEXT: v_mov_b32_e32 v49, v51
@@ -19222,7 +19214,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v51, v41
; VI-NEXT: v_mov_b32_e32 v36, v44
; VI-NEXT: v_mov_b32_e32 v53, v54
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v54, v60
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
@@ -19235,7 +19227,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: ; %bb.4: ; %cmp.true
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37
; VI-NEXT: s_add_i32 s28, s28, 3
; VI-NEXT: s_and_b32 s4, s28, 0xff
@@ -19820,8 +19811,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9
; GFX9-NEXT: s_waitcnt vmcnt(5)
@@ -20000,16 +19991,18 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_lshl_b32 s6, s19, 8
; GFX9-NEXT: s_lshl_b32 s7, s23, 8
; GFX9-NEXT: s_lshl_b32 s8, s27, 8
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -20036,9 +20029,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -20054,14 +20046,16 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
@@ -20073,10 +20067,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_mov_b32_e32 v61, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -20089,10 +20084,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_mov_b32_e32 v37, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
@@ -20106,17 +20103,22 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -20132,45 +20134,24 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v35, v62
; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(11)
-; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(10)
; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_mov_b32_e32 v40, v30
; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -20221,18 +20202,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_branch .LBB15_3
; GFX9-NEXT: .LBB15_2:
-; GFX9-NEXT: v_mov_b32_e32 v38, v51
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v33, v43
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
@@ -20246,6 +20215,18 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: v_mov_b32_e32 v38, v51
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v35, v62
; GFX9-NEXT: v_mov_b32_e32 v36, v31
; GFX9-NEXT: v_mov_b32_e32 v40, v30
@@ -20683,7 +20664,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -20716,7 +20697,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -21573,7 +21554,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-TRUE16-NEXT: .LBB15_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -21606,7 +21587,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -21624,7 +21605,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -21657,7 +21638,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -22514,7 +22495,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-FAKE16-NEXT: .LBB15_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -22547,7 +22528,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -23110,10 +23091,25 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
@@ -23292,22 +23288,6 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -26129,7 +26109,10 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -26146,9 +26129,6 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -26714,7 +26694,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -29181,7 +29161,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
@@ -29214,7 +29194,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
@@ -29247,7 +29227,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
-; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
@@ -30049,7 +30029,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
@@ -30082,7 +30062,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
@@ -30115,7 +30095,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
@@ -30155,7 +30135,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
@@ -30188,7 +30168,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
@@ -30221,7 +30201,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
@@ -30913,7 +30893,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
@@ -30946,7 +30926,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
@@ -30979,7 +30959,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
@@ -31788,6 +31768,22 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v42
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v2, v38
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -31807,22 +31803,6 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -32493,22 +32473,6 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v7
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -32524,6 +32488,22 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v7
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -34732,7 +34712,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -34765,7 +34745,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -34798,7 +34778,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -34876,7 +34856,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v185, off, s32
; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -34909,7 +34889,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -34942,7 +34922,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -35000,6 +34980,10 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
; SI-LABEL: bitcast_v32i32_to_v64i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -35016,10 +35000,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr63
@@ -35051,14 +35031,13 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; kill: killed $vgpr39
; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB24_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16
@@ -35103,7 +35082,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
; SI-NEXT: s_cbranch_execz .LBB24_4
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
@@ -35330,6 +35308,22 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27
@@ -35356,7 +35350,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -35369,22 +35363,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -36338,7 +36316,13 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
@@ -36370,12 +36354,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -36391,7 +36369,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
@@ -36608,7 +36585,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
@@ -37782,7 +37758,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -37815,7 +37791,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -37848,7 +37824,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -37926,7 +37902,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v185, off, s32
; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -37959,7 +37935,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -37992,7 +37968,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -40033,22 +40009,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; SI-LABEL: bitcast_v32f32_to_v128i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
@@ -40181,6 +40141,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr41
@@ -40212,14 +40188,13 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB36_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -40451,7 +40426,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; SI-NEXT: s_cbranch_execz .LBB36_4
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_f32_e32 v31, 1.0, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f32_e32 v32, 1.0, v32
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -40874,8 +40848,24 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v13
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
@@ -41223,44 +41213,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v32f32_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -41365,6 +41323,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -41470,129 +41444,129 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; VI-NEXT: v_mov_b32_e32 v55, v39
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
-; VI-NEXT: v_mov_b32_e32 v55, v39
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
-; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27
; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10
; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9
; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9
@@ -42041,9 +42015,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -42183,15 +42158,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55
; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -42208,12 +42176,23 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v32f32_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -42230,9 +42209,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
@@ -42365,7 +42341,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
@@ -42421,7 +42396,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(29)
+; GFX9-NEXT: s_waitcnt vmcnt(45)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
@@ -42436,7 +42411,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(31)
+; GFX9-NEXT: s_waitcnt vmcnt(47)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -42448,149 +42423,147 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16
; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15
@@ -42598,7 +42571,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12
; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8
@@ -42626,7 +42601,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: s_cbranch_execz .LBB36_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32
-; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: s_waitcnt vmcnt(44)
; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30
@@ -42934,9 +42909,25 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
@@ -43208,22 +43199,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -43666,7 +43641,11 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
@@ -43687,10 +43666,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -44310,7 +44285,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20
@@ -44770,27 +44745,11 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 24
-; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8
-; SI-NEXT: v_add_f32_e64 v53, s23, 1.0
-; SI-NEXT: v_add_f32_e64 v52, s22, 1.0
; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8
; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v2
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
@@ -44842,24 +44801,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v12
+; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v16
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v16
+; SI-NEXT: v_add_f32_e64 v53, s23, 1.0
+; SI-NEXT: v_add_f32_e64 v52, s22, 1.0
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v16
+; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v21
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v21
+; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v26
@@ -44868,6 +44836,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v26
; SI-NEXT: v_add_f32_e64 v41, s21, 1.0
; SI-NEXT: v_add_f32_e64 v40, s20, 1.0
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v26
@@ -44875,6 +44845,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: v_add_f32_e64 v57, s16, 1.0
; SI-NEXT: v_add_f32_e64 v46, s19, 1.0
; SI-NEXT: v_add_f32_e64 v45, s18, 1.0
+; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8
; SI-NEXT: v_lshr_b64 v[31:32], v[40:41], 16
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -44885,6 +44856,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: v_lshr_b64 v[27:28], v[40:41], 24
; SI-NEXT: v_lshr_b64 v[33:34], v[45:46], 24
; SI-NEXT: v_lshr_b64 v[38:39], v[45:46], 8
@@ -45408,33 +45381,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v13, s98
+; SI-NEXT: v_mov_b32_e32 v27, s62
; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v13, s46
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v13, s56
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v13, s58
-; SI-NEXT: v_mov_b32_e32 v27, s62
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v13, s46
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v27, s72
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v13, s56
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v27, s74
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v13, s58
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v27, s76
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v57, s16
@@ -45468,6 +45441,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: v_mov_b32_e32 v3, s6
; SI-NEXT: v_mov_b32_e32 v4, s7
; SI-NEXT: v_readlane_b32 s5, v61, 1
+; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v13, s60
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v27, s78
@@ -45694,9 +45668,22 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v14, 0xff, v29
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13
; SI-NEXT: v_or_b32_e32 v13, v14, v13
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v14, 0xff, v17
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
@@ -45809,17 +45796,16 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xff, v15
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13
; SI-NEXT: v_or_b32_e32 v13, v14, v13
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17
; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: v_or_b32_e32 v13, v13, v14
@@ -46070,19 +46056,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
@@ -46687,6 +46660,10 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; VI-NEXT: ; implicit-def: $sgpr46
; VI-NEXT: s_branch .LBB37_2
; VI-NEXT: .LBB37_4:
+; VI-NEXT: v_mov_b32_e32 v53, s46
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s56
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_readlane_b32 s4, v62, 0
; VI-NEXT: v_mov_b32_e32 v48, s4
@@ -46764,6 +46741,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v39, s4
; VI-NEXT: v_readlane_b32 s4, v62, 26
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s58
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v39, s4
; VI-NEXT: v_readlane_b32 s4, v62, 27
@@ -46841,6 +46821,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; VI-NEXT: v_readlane_b32 s4, v62, 51
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v39, s4
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s60
; VI-NEXT: v_readlane_b32 s4, v62, 52
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v39, s4
@@ -46859,40 +46842,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; VI-NEXT: v_readlane_b32 s4, v62, 57
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v39, s4
-; VI-NEXT: v_mov_b32_e32 v53, s46
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s56
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s58
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s60
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s62
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s72
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s74
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s76
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s78
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s88
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s90
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v31, s16
; VI-NEXT: v_mov_b32_e32 v32, s17
; VI-NEXT: v_mov_b32_e32 v29, s18
@@ -46946,11 +46895,35 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; VI-NEXT: v_mov_b32_e32 v42, s82
; VI-NEXT: v_mov_b32_e32 v37, s81
; VI-NEXT: v_mov_b32_e32 v50, s80
-; VI-NEXT: v_mov_b32_e32 v53, s30
-; VI-NEXT: v_mov_b32_e32 v54, s34
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v39, s36
; VI-NEXT: v_mov_b32_e32 v40, s38
; VI-NEXT: v_mov_b32_e32 v41, s48
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s62
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s72
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s74
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s76
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s78
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s88
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s90
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s30
+; VI-NEXT: v_mov_b32_e32 v54, s34
; VI-NEXT: .LBB37_5: ; %end
; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v34
; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v35
@@ -47018,6 +46991,20 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v50
; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v36
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: v_readlane_b32 s87, v63, 31
; VI-NEXT: v_readlane_b32 s86, v63, 30
; VI-NEXT: v_readlane_b32 s85, v63, 29
@@ -47050,7 +47037,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; VI-NEXT: v_readlane_b32 s34, v63, 2
; VI-NEXT: v_readlane_b32 s31, v63, 1
; VI-NEXT: v_readlane_b32 s30, v63, 0
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v24, vcc, 36, v0
@@ -47341,20 +47328,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
@@ -48123,10 +48096,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX9-NEXT: v_readlane_b32 s4, v62, 49
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v40, s4
-; GFX9-NEXT: v_mov_b32_e32 v49, s52
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v40, s46
-; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -48175,6 +48146,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v40, s94
+; GFX9-NEXT: v_mov_b32_e32 v49, s52
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
@@ -48222,6 +48194,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v54, s55
; GFX9-NEXT: v_mov_b32_e32 v50, s53
; GFX9-NEXT: v_mov_b32_e32 v60, s54
+; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v49, s51
; GFX9-NEXT: v_mov_b32_e32 v59, s50
; GFX9-NEXT: v_mov_b32_e32 v58, s49
@@ -48291,6 +48264,20 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v49
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s99, v63, 35
; GFX9-NEXT: v_readlane_b32 s98, v63, 34
; GFX9-NEXT: v_readlane_b32 s97, v63, 33
@@ -48327,7 +48314,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX9-NEXT: v_readlane_b32 s34, v63, 2
; GFX9-NEXT: v_readlane_b32 s31, v63, 1
; GFX9-NEXT: v_readlane_b32 s30, v63, 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(14)
; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20
@@ -48621,20 +48608,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
@@ -48646,7 +48619,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
@@ -48681,7 +48654,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX11-NEXT: v_writelane_b32 v76, s101, 5
; GFX11-NEXT: s_mov_b32 vcc_hi, 0
; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64
@@ -49601,7 +49574,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80
; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v74, off, s32
; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8
@@ -49663,7 +49636,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX11-NEXT: v_readlane_b32 s31, v75, 1
; GFX11-NEXT: v_readlane_b32 s30, v75, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76
; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80
; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84
@@ -49876,13 +49849,26 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196
@@ -50043,44 +50029,30 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380
; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB38_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
@@ -50089,11 +50061,11 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
@@ -50696,7 +50668,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB38_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
@@ -50710,8 +50681,8 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
@@ -51391,13 +51362,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -51534,34 +51517,20 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -52047,7 +52016,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_add_u16_e32 v9, 3, v61
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
@@ -52625,13 +52593,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -52773,34 +52755,20 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -53287,7 +53255,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_add_u16_e32 v9, 3, v61
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
@@ -54426,7 +54393,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584
@@ -54459,7 +54426,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468
-; GFX11-FAKE16-NEXT: s_clause 0x12
+; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456
@@ -55400,7 +55367,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36
; GFX11-FAKE16-NEXT: .LBB38_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392
; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396
; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400
@@ -55433,7 +55400,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508
; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0x12
+; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524
; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528
@@ -56150,24 +56117,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_branch .LBB39_3
; SI-NEXT: .LBB39_2:
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v55, v56
; SI-NEXT: v_mov_b32_e32 v42, v46
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
@@ -56178,10 +56134,22 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: .LBB39_3: ; %Flow
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_mov_b32_e32 v35, v57
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
@@ -56191,7 +56159,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; SI-NEXT: ; %bb.4: ; %cmp.true
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -56786,13 +56753,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5
; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7
; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9
; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
@@ -57020,11 +56987,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_and_b32 s4, s28, 0xff
; VI-NEXT: s_lshl_b32 s5, s29, 8
; VI-NEXT: s_or_b32 s4, s4, s5
@@ -57034,11 +57001,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_lshl_b32 s7, s23, 8
; VI-NEXT: s_lshl_b32 s8, s27, 8
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
@@ -57046,6 +57010,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -57254,12 +57220,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_branch .LBB39_3
; VI-NEXT: .LBB39_2:
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v44, v56
; VI-NEXT: v_mov_b32_e32 v41, v33
; VI-NEXT: v_mov_b32_e32 v50, v40
@@ -57277,6 +57237,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v54, v53
; VI-NEXT: v_mov_b32_e32 v52, v36
; VI-NEXT: v_mov_b32_e32 v49, v51
@@ -57286,7 +57252,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_mov_b32_e32 v51, v41
; VI-NEXT: v_mov_b32_e32 v36, v44
; VI-NEXT: v_mov_b32_e32 v53, v54
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v54, v60
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
@@ -57299,7 +57265,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: ; %bb.4: ; %cmp.true
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37
; VI-NEXT: s_add_i32 s28, s28, 3
; VI-NEXT: s_and_b32 s4, s28, 0xff
@@ -57884,8 +57849,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9
; GFX9-NEXT: s_waitcnt vmcnt(5)
@@ -58064,16 +58029,18 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: s_lshl_b32 s6, s19, 8
; GFX9-NEXT: s_lshl_b32 s7, s23, 8
; GFX9-NEXT: s_lshl_b32 s8, s27, 8
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -58100,9 +58067,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -58118,14 +58084,16 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
@@ -58137,10 +58105,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_mov_b32_e32 v61, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -58153,10 +58122,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_mov_b32_e32 v37, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
@@ -58170,17 +58141,22 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -58196,45 +58172,24 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v35, v62
; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(11)
-; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(10)
; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_mov_b32_e32 v40, v30
; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -58285,18 +58240,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_branch .LBB39_3
; GFX9-NEXT: .LBB39_2:
-; GFX9-NEXT: v_mov_b32_e32 v38, v51
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v33, v43
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
@@ -58310,6 +58253,18 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: v_mov_b32_e32 v38, v51
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v35, v62
; GFX9-NEXT: v_mov_b32_e32 v36, v31
; GFX9-NEXT: v_mov_b32_e32 v40, v30
@@ -58747,7 +58702,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -58780,7 +58735,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -59637,7 +59592,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-TRUE16-NEXT: .LBB39_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -59670,7 +59625,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -59688,7 +59643,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -59721,7 +59676,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -60578,7 +60533,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-FAKE16-NEXT: .LBB39_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -60611,7 +60566,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -61174,10 +61129,25 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
@@ -61356,22 +61326,6 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -62140,6 +62094,20 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: v_readlane_b32 s99, v63, 35
; SI-NEXT: v_readlane_b32 s98, v63, 34
; SI-NEXT: v_readlane_b32 s97, v63, 33
@@ -62176,10 +62144,9 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; SI-NEXT: v_readlane_b32 s34, v63, 2
; SI-NEXT: v_readlane_b32 s31, v63, 1
; SI-NEXT: v_readlane_b32 s30, v63, 0
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
@@ -62193,20 +62160,6 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
@@ -64239,7 +64192,10 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -64256,9 +64212,6 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -64824,7 +64777,7 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -67291,7 +67244,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
@@ -67324,7 +67277,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
@@ -67357,7 +67310,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
-; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
@@ -68159,7 +68112,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
@@ -68192,7 +68145,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
@@ -68225,7 +68178,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
@@ -68265,7 +68218,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
@@ -68298,7 +68251,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
@@ -68331,7 +68284,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
@@ -69023,7 +68976,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
@@ -69056,7 +69009,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
@@ -69089,7 +69042,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
@@ -69898,6 +69851,22 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v42
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v2, v38
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -69917,22 +69886,6 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -70603,11 +70556,6 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v58
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v5
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -70624,6 +70572,11 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v5
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB45_4:
@@ -72813,7 +72766,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -72846,7 +72799,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -72879,7 +72832,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -72957,7 +72910,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v185, off, s32
; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -72990,7 +72943,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -73023,7 +72976,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -73081,6 +73034,10 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) {
; SI-LABEL: bitcast_v32f32_to_v64i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -73097,10 +73054,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr63
@@ -73132,14 +73085,13 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; kill: killed $vgpr39
; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB48_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16
@@ -73184,7 +73136,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) {
; SI-NEXT: s_cbranch_execz .LBB48_4
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_f32_e32 v31, 1.0, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f32_e32 v32, 1.0, v32
; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
; SI-NEXT: v_add_f32_e32 v1, 1.0, v1
@@ -73411,6 +73362,22 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27
@@ -73437,7 +73404,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -73450,22 +73417,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -73951,9 +73902,25 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
@@ -73979,22 +73946,6 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB49_4:
@@ -74373,7 +74324,13 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
@@ -74405,12 +74362,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -74426,7 +74377,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
@@ -74643,7 +74593,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
@@ -75817,7 +75766,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -75850,7 +75799,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -75883,7 +75832,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -75961,7 +75910,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v185, off, s32
; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -75994,7 +75943,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -76027,7 +75976,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -77054,22 +77003,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; SI-LABEL: bitcast_v16i64_to_v128i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
@@ -77202,6 +77135,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr41
@@ -77233,14 +77182,13 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB56_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -77501,7 +77449,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc
; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29
; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
@@ -77895,8 +77842,24 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v13
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
@@ -78244,44 +78207,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v16i64_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -78386,6 +78317,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -78491,129 +78438,129 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; VI-NEXT: v_mov_b32_e32 v55, v39
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
-; VI-NEXT: v_mov_b32_e32 v55, v39
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
-; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27
; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10
; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9
; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9
@@ -79062,9 +79009,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -79204,15 +79152,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55
; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -79229,12 +79170,23 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v16i64_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -79251,9 +79203,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
@@ -79386,7 +79335,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
@@ -79442,7 +79390,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(29)
+; GFX9-NEXT: s_waitcnt vmcnt(45)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
@@ -79457,7 +79405,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(31)
+; GFX9-NEXT: s_waitcnt vmcnt(47)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -79469,149 +79417,147 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16
; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15
@@ -79619,7 +79565,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12
; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8
@@ -79676,7 +79624,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v28, vcc
; GFX9-NEXT: v_add_co_u32_e32 v29, vcc, 3, v29
; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, 0, v30, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: s_waitcnt vmcnt(44)
; GFX9-NEXT: v_add_co_u32_e32 v31, vcc, 3, v31
; GFX9-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v32, vcc
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
@@ -79955,9 +79903,25 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
@@ -80229,22 +80193,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -80712,7 +80660,11 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
@@ -80733,10 +80685,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -81381,7 +81329,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20
@@ -84631,7 +84579,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v16, s32
; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4
; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8
@@ -85566,7 +85514,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: v_readlane_b32 s35, v16, 3
; GFX11-NEXT: v_readlane_b32 s34, v16, 2
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v16, off, s32
; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8
@@ -85779,13 +85727,26 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196
@@ -85946,44 +85907,30 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380
; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB58_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
@@ -85992,11 +85939,11 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
@@ -86599,7 +86546,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB58_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
@@ -86613,8 +86559,8 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
@@ -87294,13 +87240,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -87437,34 +87395,20 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -87950,7 +87894,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_add_u16_e32 v9, 3, v61
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
@@ -88528,13 +88471,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -88676,34 +88633,20 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -89190,7 +89133,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_add_u16_e32 v9, 3, v61
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
@@ -90329,7 +90271,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584
@@ -90362,7 +90304,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468
-; GFX11-FAKE16-NEXT: s_clause 0x12
+; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456
@@ -91303,7 +91245,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36
; GFX11-FAKE16-NEXT: .LBB58_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392
; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396
; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400
@@ -91336,7 +91278,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508
; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0x12
+; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524
; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528
@@ -92053,24 +91995,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_branch .LBB59_3
; SI-NEXT: .LBB59_2:
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v55, v56
; SI-NEXT: v_mov_b32_e32 v42, v46
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
@@ -92081,10 +92012,22 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: .LBB59_3: ; %Flow
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_mov_b32_e32 v35, v57
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
@@ -92094,7 +92037,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: ; %bb.4: ; %cmp.true
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -92689,13 +92631,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5
; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7
; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9
; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
@@ -92923,11 +92865,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_and_b32 s4, s28, 0xff
; VI-NEXT: s_lshl_b32 s5, s29, 8
; VI-NEXT: s_or_b32 s4, s4, s5
@@ -92937,11 +92879,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_lshl_b32 s7, s23, 8
; VI-NEXT: s_lshl_b32 s8, s27, 8
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
@@ -92949,6 +92888,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -93157,12 +93098,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_branch .LBB59_3
; VI-NEXT: .LBB59_2:
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v44, v56
; VI-NEXT: v_mov_b32_e32 v41, v33
; VI-NEXT: v_mov_b32_e32 v50, v40
@@ -93180,6 +93115,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v54, v53
; VI-NEXT: v_mov_b32_e32 v52, v36
; VI-NEXT: v_mov_b32_e32 v49, v51
@@ -93189,7 +93130,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v51, v41
; VI-NEXT: v_mov_b32_e32 v36, v44
; VI-NEXT: v_mov_b32_e32 v53, v54
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v54, v60
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
@@ -93202,7 +93143,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: ; %bb.4: ; %cmp.true
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37
; VI-NEXT: s_add_i32 s28, s28, 3
; VI-NEXT: s_and_b32 s4, s28, 0xff
@@ -93787,8 +93727,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9
; GFX9-NEXT: s_waitcnt vmcnt(5)
@@ -93967,16 +93907,18 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_lshl_b32 s6, s19, 8
; GFX9-NEXT: s_lshl_b32 s7, s23, 8
; GFX9-NEXT: s_lshl_b32 s8, s27, 8
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -94003,9 +93945,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -94021,14 +93962,16 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
@@ -94040,10 +93983,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_mov_b32_e32 v61, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -94056,10 +94000,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_mov_b32_e32 v37, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
@@ -94073,17 +94019,22 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -94099,45 +94050,24 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v35, v62
; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(11)
-; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(10)
; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_mov_b32_e32 v40, v30
; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -94188,18 +94118,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_branch .LBB59_3
; GFX9-NEXT: .LBB59_2:
-; GFX9-NEXT: v_mov_b32_e32 v38, v51
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v33, v43
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
@@ -94213,6 +94131,18 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: v_mov_b32_e32 v38, v51
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v35, v62
; GFX9-NEXT: v_mov_b32_e32 v36, v31
; GFX9-NEXT: v_mov_b32_e32 v40, v30
@@ -94650,7 +94580,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -94683,7 +94613,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -95540,7 +95470,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-TRUE16-NEXT: .LBB59_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -95573,7 +95503,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -95591,7 +95521,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -95624,7 +95554,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -96481,7 +96411,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-FAKE16-NEXT: .LBB59_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -96514,7 +96444,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -97078,10 +97008,25 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
@@ -97260,22 +97205,6 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -100084,7 +100013,10 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -100101,9 +100033,6 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -100669,7 +100598,7 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -103136,7 +103065,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
@@ -103169,7 +103098,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
@@ -103202,7 +103131,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
-; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
@@ -104004,7 +103933,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
@@ -104037,7 +103966,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
@@ -104070,7 +103999,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
@@ -104110,7 +104039,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
@@ -104143,7 +104072,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
@@ -104176,7 +104105,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
@@ -104868,7 +104797,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
@@ -104901,7 +104830,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
@@ -104934,7 +104863,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
@@ -105740,6 +105669,22 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v42
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v2, v48
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -105759,22 +105704,6 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -106453,22 +106382,6 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v7
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -106484,6 +106397,22 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v7
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -108700,7 +108629,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -108733,7 +108662,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -108766,7 +108695,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -108844,7 +108773,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v185, off, s32
; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -108877,7 +108806,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -108910,7 +108839,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -108968,6 +108897,10 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) {
; SI-LABEL: bitcast_v16i64_to_v64i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -108984,10 +108917,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $vgpr63
@@ -109019,14 +108948,13 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; kill: killed $vgpr48
; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB68_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16
@@ -109099,7 +109027,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc
; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29
; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16
@@ -109296,6 +109223,22 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27
@@ -109322,7 +109265,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -109335,22 +109278,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -110320,7 +110247,13 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
@@ -110352,12 +110285,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -110373,7 +110300,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
@@ -110590,7 +110516,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
@@ -111764,7 +111689,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -111797,7 +111722,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -111830,7 +111755,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -111908,7 +111833,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v185, off, s32
; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -111941,7 +111866,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -111974,7 +111899,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -112032,22 +111957,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; SI-LABEL: bitcast_v16f64_to_v128i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -112180,6 +112089,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr42
@@ -112211,14 +112136,13 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB72_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -112449,7 +112373,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB72_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0
; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0
; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24
@@ -112827,8 +112750,24 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v11
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
@@ -113206,44 +113145,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v16f64_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -113346,6 +113253,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr57
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -113448,132 +113371,132 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32]
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
-; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v28
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v11
; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10
; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10
@@ -114009,9 +113932,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -114151,17 +114075,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v42
-; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -114178,12 +114092,26 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v16f64_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -114200,9 +114128,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; kill: killed $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr41
@@ -114335,7 +114260,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr48
; GFX9-NEXT: ; kill: killed $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr41
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
@@ -114395,7 +114319,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(31)
+; GFX9-NEXT: s_waitcnt vmcnt(47)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -114408,7 +114332,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(33)
+; GFX9-NEXT: s_waitcnt vmcnt(49)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -114416,152 +114340,151 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
@@ -114571,6 +114494,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v12
; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10
; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v9
; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v9
@@ -114599,7 +114523,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB72_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
-; GFX9-NEXT: s_waitcnt vmcnt(30)
+; GFX9-NEXT: s_waitcnt vmcnt(46)
; GFX9-NEXT: v_add_f64 v[31:32], v[31:32], 1.0
; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0
; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
@@ -114904,8 +114828,24 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -115170,22 +115110,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -115628,7 +115552,11 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
@@ -115649,10 +115577,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -116272,7 +116196,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20
@@ -117056,6 +116980,11 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: v_mov_b32_e32 v33, s4
; SI-NEXT: v_readlane_b32 s4, v61, 39
; SI-NEXT: v_mov_b32_e32 v30, s4
+; SI-NEXT: v_mov_b32_e32 v29, s46
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v29, s98
; SI-NEXT: v_readlane_b32 s4, v61, 40
; SI-NEXT: v_mov_b32_e32 v34, s4
; SI-NEXT: v_readlane_b32 s4, v61, 41
@@ -117148,6 +117077,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v25, s4
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v29, s96
; SI-NEXT: v_readlane_b32 s4, v62, 0
; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -117204,20 +117137,69 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v25, s4
-; SI-NEXT: v_mov_b32_e32 v29, s46
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v29, s98
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v29, s96
+; SI-NEXT: v_readlane_b32 s4, v62, 14
+; SI-NEXT: v_mov_b32_e32 v60, s4
+; SI-NEXT: v_readlane_b32 s4, v62, 15
+; SI-NEXT: v_mov_b32_e32 v31, s4
+; SI-NEXT: v_readlane_b32 s4, v62, 16
+; SI-NEXT: v_mov_b32_e32 v32, s4
+; SI-NEXT: v_readlane_b32 s4, v62, 17
+; SI-NEXT: v_mov_b32_e32 v18, s5
+; SI-NEXT: v_mov_b32_e32 v46, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 0
+; SI-NEXT: v_readlane_b32 s5, v61, 1
+; SI-NEXT: v_mov_b32_e32 v59, s17
+; SI-NEXT: v_mov_b32_e32 v58, s16
+; SI-NEXT: v_mov_b32_e32 v45, s19
+; SI-NEXT: v_mov_b32_e32 v44, s18
+; SI-NEXT: v_mov_b32_e32 v53, s21
+; SI-NEXT: v_mov_b32_e32 v52, s20
+; SI-NEXT: v_mov_b32_e32 v39, s23
+; SI-NEXT: v_mov_b32_e32 v38, s22
+; SI-NEXT: v_mov_b32_e32 v24, s25
+; SI-NEXT: v_mov_b32_e32 v23, s24
+; SI-NEXT: v_mov_b32_e32 v22, s27
+; SI-NEXT: v_mov_b32_e32 v21, s26
+; SI-NEXT: v_mov_b32_e32 v20, s29
+; SI-NEXT: v_mov_b32_e32 v19, s28
+; SI-NEXT: v_mov_b32_e32 v16, s7
+; SI-NEXT: v_mov_b32_e32 v15, s6
+; SI-NEXT: v_mov_b32_e32 v14, s9
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s86
+; SI-NEXT: v_mov_b32_e32 v13, s8
+; SI-NEXT: v_mov_b32_e32 v12, s11
+; SI-NEXT: v_mov_b32_e32 v11, s10
+; SI-NEXT: v_mov_b32_e32 v10, s13
+; SI-NEXT: v_mov_b32_e32 v9, s12
+; SI-NEXT: v_mov_b32_e32 v8, s15
+; SI-NEXT: v_mov_b32_e32 v7, s14
+; SI-NEXT: v_mov_b32_e32 v6, s41
+; SI-NEXT: v_mov_b32_e32 v5, s40
+; SI-NEXT: v_mov_b32_e32 v4, s43
+; SI-NEXT: v_mov_b32_e32 v3, s42
+; SI-NEXT: v_mov_b32_e32 v2, s45
+; SI-NEXT: v_mov_b32_e32 v1, s44
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v28, s38
+; SI-NEXT: v_mov_b32_e32 v27, s36
+; SI-NEXT: v_mov_b32_e32 v26, s34
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v25, s30
+; SI-NEXT: v_mov_b32_e32 v56, s94
+; SI-NEXT: v_mov_b32_e32 v55, s92
+; SI-NEXT: v_mov_b32_e32 v54, s90
+; SI-NEXT: v_mov_b32_e32 v42, s88
+; SI-NEXT: v_mov_b32_e32 v41, s78
+; SI-NEXT: v_mov_b32_e32 v40, s76
+; SI-NEXT: v_mov_b32_e32 v50, s74
+; SI-NEXT: v_mov_b32_e32 v49, s72
+; SI-NEXT: v_mov_b32_e32 v48, s62
+; SI-NEXT: v_mov_b32_e32 v47, s60
+; SI-NEXT: v_mov_b32_e32 v36, s58
+; SI-NEXT: v_mov_b32_e32 v35, s56
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
@@ -117260,165 +117242,108 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: v_mov_b32_e32 v29, s50
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s4, v62, 14
-; SI-NEXT: v_mov_b32_e32 v60, s4
-; SI-NEXT: v_readlane_b32 s4, v62, 15
-; SI-NEXT: v_mov_b32_e32 v31, s4
-; SI-NEXT: v_readlane_b32 s4, v62, 16
-; SI-NEXT: v_mov_b32_e32 v32, s4
-; SI-NEXT: v_readlane_b32 s4, v62, 17
-; SI-NEXT: v_mov_b32_e32 v18, s5
-; SI-NEXT: v_mov_b32_e32 v46, s4
-; SI-NEXT: v_readlane_b32 s4, v61, 0
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 2
+; SI-NEXT: v_readlane_b32 s5, v61, 3
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 1
-; SI-NEXT: v_readlane_b32 s4, v61, 2
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 4
+; SI-NEXT: v_readlane_b32 s5, v61, 5
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 3
-; SI-NEXT: v_readlane_b32 s4, v61, 4
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 6
+; SI-NEXT: v_readlane_b32 s5, v61, 7
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 5
-; SI-NEXT: v_readlane_b32 s4, v61, 6
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 8
+; SI-NEXT: v_readlane_b32 s5, v61, 9
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 7
-; SI-NEXT: v_readlane_b32 s4, v61, 8
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 10
+; SI-NEXT: v_readlane_b32 s5, v61, 11
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 9
-; SI-NEXT: v_readlane_b32 s4, v61, 10
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 12
+; SI-NEXT: v_readlane_b32 s5, v61, 13
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 11
-; SI-NEXT: v_readlane_b32 s4, v61, 12
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 14
+; SI-NEXT: v_readlane_b32 s5, v61, 15
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 13
-; SI-NEXT: v_readlane_b32 s4, v61, 14
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 16
+; SI-NEXT: v_readlane_b32 s5, v61, 17
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 15
-; SI-NEXT: v_readlane_b32 s4, v61, 16
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 18
+; SI-NEXT: v_readlane_b32 s5, v61, 19
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 17
-; SI-NEXT: v_readlane_b32 s4, v61, 18
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 20
+; SI-NEXT: v_readlane_b32 s5, v61, 21
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 19
-; SI-NEXT: v_readlane_b32 s4, v61, 20
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 22
+; SI-NEXT: v_readlane_b32 s5, v61, 23
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 21
-; SI-NEXT: v_readlane_b32 s4, v61, 22
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 24
+; SI-NEXT: v_readlane_b32 s5, v61, 25
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 23
-; SI-NEXT: v_readlane_b32 s4, v61, 24
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 26
+; SI-NEXT: v_readlane_b32 s5, v61, 27
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 25
-; SI-NEXT: v_readlane_b32 s4, v61, 26
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 28
+; SI-NEXT: v_readlane_b32 s5, v61, 29
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 27
-; SI-NEXT: v_readlane_b32 s4, v61, 28
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 30
+; SI-NEXT: v_readlane_b32 s5, v61, 31
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 29
-; SI-NEXT: v_readlane_b32 s4, v61, 30
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 32
+; SI-NEXT: v_readlane_b32 s5, v61, 33
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s48
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 31
-; SI-NEXT: v_readlane_b32 s4, v61, 32
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
-; SI-NEXT: v_mov_b32_e32 v59, s17
-; SI-NEXT: v_mov_b32_e32 v58, s16
-; SI-NEXT: v_mov_b32_e32 v45, s19
-; SI-NEXT: v_mov_b32_e32 v44, s18
-; SI-NEXT: v_mov_b32_e32 v53, s21
-; SI-NEXT: v_mov_b32_e32 v52, s20
-; SI-NEXT: v_mov_b32_e32 v39, s23
-; SI-NEXT: v_mov_b32_e32 v38, s22
-; SI-NEXT: v_mov_b32_e32 v24, s25
-; SI-NEXT: v_mov_b32_e32 v23, s24
-; SI-NEXT: v_mov_b32_e32 v22, s27
-; SI-NEXT: v_mov_b32_e32 v21, s26
-; SI-NEXT: v_mov_b32_e32 v20, s29
-; SI-NEXT: v_mov_b32_e32 v19, s28
-; SI-NEXT: v_mov_b32_e32 v16, s7
-; SI-NEXT: v_mov_b32_e32 v15, s6
-; SI-NEXT: v_mov_b32_e32 v14, s9
-; SI-NEXT: v_mov_b32_e32 v13, s8
-; SI-NEXT: v_mov_b32_e32 v12, s11
-; SI-NEXT: v_mov_b32_e32 v11, s10
-; SI-NEXT: v_mov_b32_e32 v10, s13
-; SI-NEXT: v_mov_b32_e32 v9, s12
-; SI-NEXT: v_mov_b32_e32 v8, s15
-; SI-NEXT: v_mov_b32_e32 v7, s14
-; SI-NEXT: v_mov_b32_e32 v6, s41
-; SI-NEXT: v_mov_b32_e32 v5, s40
-; SI-NEXT: v_mov_b32_e32 v4, s43
-; SI-NEXT: v_mov_b32_e32 v3, s42
-; SI-NEXT: v_mov_b32_e32 v2, s45
-; SI-NEXT: v_mov_b32_e32 v1, s44
-; SI-NEXT: v_mov_b32_e32 v28, s38
-; SI-NEXT: v_mov_b32_e32 v27, s36
-; SI-NEXT: v_mov_b32_e32 v26, s34
-; SI-NEXT: v_mov_b32_e32 v25, s30
-; SI-NEXT: v_mov_b32_e32 v56, s94
-; SI-NEXT: v_mov_b32_e32 v55, s92
-; SI-NEXT: v_mov_b32_e32 v54, s90
-; SI-NEXT: v_mov_b32_e32 v42, s88
-; SI-NEXT: v_mov_b32_e32 v41, s78
-; SI-NEXT: v_mov_b32_e32 v40, s76
-; SI-NEXT: v_mov_b32_e32 v50, s74
-; SI-NEXT: v_mov_b32_e32 v49, s72
-; SI-NEXT: v_mov_b32_e32 v48, s62
-; SI-NEXT: v_mov_b32_e32 v47, s60
-; SI-NEXT: v_mov_b32_e32 v36, s58
-; SI-NEXT: v_mov_b32_e32 v35, s56
-; SI-NEXT: v_readlane_b32 s5, v61, 33
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: .LBB73_5: ; %end
@@ -117711,9 +117636,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; SI-NEXT: v_or_b32_e32 v17, v17, v18
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19
+; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
@@ -118002,15 +117927,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v51
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -118024,6 +117940,15 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v51
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
@@ -118690,6 +118615,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_mov_b32_e32 v35, s4
; VI-NEXT: v_readlane_b32 s4, v62, 11
; VI-NEXT: v_mov_b32_e32 v41, s4
+; VI-NEXT: v_mov_b32_e32 v40, s48
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s38
; VI-NEXT: v_readlane_b32 s4, v62, 12
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v35, s4
@@ -118727,6 +118656,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_mov_b32_e32 v35, s4
; VI-NEXT: v_readlane_b32 s4, v62, 25
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s36
; VI-NEXT: v_mov_b32_e32 v35, s4
; VI-NEXT: v_readlane_b32 s4, v62, 26
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
@@ -118764,6 +118696,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_readlane_b32 s4, v62, 37
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v35, s4
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s34
; VI-NEXT: v_readlane_b32 s4, v62, 38
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v35, s4
@@ -118779,52 +118714,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_readlane_b32 s4, v62, 42
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v35, s4
-; VI-NEXT: v_mov_b32_e32 v40, s48
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s38
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s36
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s34
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s30
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s90
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s88
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s78
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s76
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s74
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s72
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s62
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s60
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s58
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s56
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: v_readlane_b32 s4, v62, 43
; VI-NEXT: v_mov_b32_e32 v53, s4
; VI-NEXT: v_readlane_b32 s4, v62, 44
@@ -118834,6 +118723,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_readlane_b32 s4, v62, 46
; VI-NEXT: v_mov_b32_e32 v58, s4
; VI-NEXT: v_readlane_b32 s4, v62, 47
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v35, s4
; VI-NEXT: v_readlane_b32 s4, v62, 48
; VI-NEXT: v_mov_b32_e32 v54, s4
@@ -118846,17 +118736,17 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_readlane_b32 s4, v62, 52
; VI-NEXT: v_mov_b32_e32 v39, s4
; VI-NEXT: v_readlane_b32 s4, v62, 53
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s30
; VI-NEXT: v_mov_b32_e32 v49, s4
; VI-NEXT: v_readlane_b32 s4, v62, 54
; VI-NEXT: v_mov_b32_e32 v61, s4
; VI-NEXT: v_readlane_b32 s4, v62, 55
; VI-NEXT: v_mov_b32_e32 v36, s4
; VI-NEXT: v_readlane_b32 s4, v62, 56
-; VI-NEXT: v_mov_b32_e32 v40, s46
; VI-NEXT: v_mov_b32_e32 v55, s4
; VI-NEXT: v_readlane_b32 s4, v62, 57
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v12, s5
; VI-NEXT: v_mov_b32_e32 v1, s44
; VI-NEXT: v_mov_b32_e32 v2, s45
@@ -118886,13 +118776,48 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_mov_b32_e32 v28, s21
; VI-NEXT: v_mov_b32_e32 v29, s18
; VI-NEXT: v_mov_b32_e32 v30, s19
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s90
; VI-NEXT: v_mov_b32_e32 v31, s16
; VI-NEXT: v_mov_b32_e32 v32, s17
; VI-NEXT: v_mov_b32_e32 v42, s70
; VI-NEXT: v_mov_b32_e32 v50, s4
-; VI-NEXT: v_mov_b32_e32 v40, v43
; VI-NEXT: v_mov_b32_e32 v46, v38
; VI-NEXT: v_mov_b32_e32 v38, v34
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s88
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s78
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s76
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s74
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s72
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s62
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s60
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s58
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s56
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s46
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, v43
; VI-NEXT: .LBB73_5: ; %end
; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v42
; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -119291,10 +119216,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -119309,6 +119231,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
@@ -119906,6 +119831,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $sgpr46
; GFX9-NEXT: s_branch .LBB73_2
; GFX9-NEXT: .LBB73_4:
+; GFX9-NEXT: v_mov_b32_e32 v41, s66
+; GFX9-NEXT: v_mov_b32_e32 v40, s36
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s34
; GFX9-NEXT: v_mov_b32_e32 v15, s81
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v15, s71
@@ -119982,6 +119913,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v15, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 9
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s30
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v15, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 10
@@ -120040,71 +119975,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_readlane_b32 s4, v62, 28
; GFX9-NEXT: v_mov_b32_e32 v29, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 29
-; GFX9-NEXT: v_mov_b32_e32 v41, s66
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v15, s4
-; GFX9-NEXT: v_mov_b32_e32 v40, s36
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s34
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s30
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s94
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s92
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s90
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s88
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s78
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s76
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s74
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s72
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s62
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s60
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s58
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s56
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: v_readlane_b32 s4, v62, 30
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v15, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 31
; GFX9-NEXT: v_mov_b32_e32 v44, s4
@@ -120119,6 +119993,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_readlane_b32 s4, v62, 36
; GFX9-NEXT: v_mov_b32_e32 v55, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 37
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s94
; GFX9-NEXT: v_mov_b32_e32 v61, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 38
; GFX9-NEXT: v_mov_b32_e32 v42, s4
@@ -120143,7 +120021,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_readlane_b32 s4, v62, 48
; GFX9-NEXT: v_mov_b32_e32 v60, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 49
-; GFX9-NEXT: v_mov_b32_e32 v40, s46
; GFX9-NEXT: v_mov_b32_e32 v12, s5
; GFX9-NEXT: v_mov_b32_e32 v1, s44
; GFX9-NEXT: v_mov_b32_e32 v2, s45
@@ -120181,6 +120058,54 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v54, s64
; GFX9-NEXT: v_mov_b32_e32 v52, s54
; GFX9-NEXT: v_mov_b32_e32 v25, s4
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s92
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s90
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s88
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s78
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s76
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s74
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s72
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s62
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s60
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s58
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s56
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s46
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -120202,6 +120127,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v25, v51, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v45
; GFX9-NEXT: v_or_b32_sdwa v48, v48, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v56
; GFX9-NEXT: v_or_b32_sdwa v50, v50, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22
@@ -120252,46 +120179,45 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_readlane_b32 s31, v63, 1
; GFX9-NEXT: v_readlane_b32 s30, v63, 0
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36
; GFX9-NEXT: v_or_b32_sdwa v27, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v44
; GFX9-NEXT: v_or_b32_sdwa v28, v28, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29
; GFX9-NEXT: v_or_b32_sdwa v29, v19, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v30
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v20, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v51
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36
; GFX9-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v19, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v59
; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v49
; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:12
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:8
; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48
+; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:8
; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -120305,9 +120231,11 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v16, v35, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:24
@@ -120319,9 +120247,23 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(14)
; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v16, v33, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:32
@@ -120343,10 +120285,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v16, v32, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -120574,20 +120513,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
@@ -120599,7 +120524,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:88
@@ -120634,7 +120559,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX11-NEXT: v_writelane_b32 v77, s101, 5
; GFX11-NEXT: s_mov_b32 vcc_hi, 0
; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-NEXT: s_clause 0x13
+; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
@@ -121542,7 +121467,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80
; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
-; GFX11-NEXT: s_clause 0x13
+; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v75, off, s32
; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:8
@@ -121605,7 +121530,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX11-NEXT: v_readlane_b32 s31, v76, 1
; GFX11-NEXT: v_readlane_b32 s30, v76, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80
; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84
; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:88
@@ -121818,13 +121743,26 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196
@@ -121985,44 +121923,30 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380
; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB74_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
@@ -122031,11 +121955,11 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
@@ -122638,7 +122562,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB74_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
@@ -122652,8 +122575,8 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
@@ -123333,13 +123256,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -123476,34 +123411,20 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -123989,7 +123910,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_add_u16_e32 v9, 3, v61
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
@@ -124567,13 +124487,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -124715,34 +124649,20 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -125229,7 +125149,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_add_u16_e32 v9, 3, v61
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
@@ -126368,7 +126287,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584
@@ -126401,7 +126320,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468
-; GFX11-FAKE16-NEXT: s_clause 0x12
+; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456
@@ -127342,7 +127261,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36
; GFX11-FAKE16-NEXT: .LBB74_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392
; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396
; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400
@@ -127375,7 +127294,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508
; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0x12
+; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524
; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528
@@ -128092,24 +128011,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_branch .LBB75_3
; SI-NEXT: .LBB75_2:
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v55, v56
; SI-NEXT: v_mov_b32_e32 v42, v46
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
@@ -128120,10 +128028,22 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: .LBB75_3: ; %Flow
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_mov_b32_e32 v35, v57
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
@@ -128133,7 +128053,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; SI-NEXT: ; %bb.4: ; %cmp.true
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -128728,13 +128647,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5
; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7
; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9
; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
@@ -128962,11 +128881,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_and_b32 s4, s28, 0xff
; VI-NEXT: s_lshl_b32 s5, s29, 8
; VI-NEXT: s_or_b32 s4, s4, s5
@@ -128976,11 +128895,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_lshl_b32 s7, s23, 8
; VI-NEXT: s_lshl_b32 s8, s27, 8
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
@@ -128988,6 +128904,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -129196,12 +129114,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_branch .LBB75_3
; VI-NEXT: .LBB75_2:
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v44, v56
; VI-NEXT: v_mov_b32_e32 v41, v33
; VI-NEXT: v_mov_b32_e32 v50, v40
@@ -129219,6 +129131,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v54, v53
; VI-NEXT: v_mov_b32_e32 v52, v36
; VI-NEXT: v_mov_b32_e32 v49, v51
@@ -129228,7 +129146,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_mov_b32_e32 v51, v41
; VI-NEXT: v_mov_b32_e32 v36, v44
; VI-NEXT: v_mov_b32_e32 v53, v54
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v54, v60
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
@@ -129241,7 +129159,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: ; %bb.4: ; %cmp.true
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37
; VI-NEXT: s_add_i32 s28, s28, 3
; VI-NEXT: s_and_b32 s4, s28, 0xff
@@ -129826,8 +129743,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9
; GFX9-NEXT: s_waitcnt vmcnt(5)
@@ -130006,16 +129923,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: s_lshl_b32 s6, s19, 8
; GFX9-NEXT: s_lshl_b32 s7, s23, 8
; GFX9-NEXT: s_lshl_b32 s8, s27, 8
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -130042,9 +129961,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -130060,14 +129978,16 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
@@ -130079,10 +129999,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_mov_b32_e32 v61, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -130095,10 +130016,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_mov_b32_e32 v37, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
@@ -130112,17 +130035,22 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -130138,45 +130066,24 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v35, v62
; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(11)
-; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(10)
; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_mov_b32_e32 v40, v30
; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -130227,18 +130134,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_branch .LBB75_3
; GFX9-NEXT: .LBB75_2:
-; GFX9-NEXT: v_mov_b32_e32 v38, v51
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v33, v43
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
@@ -130252,6 +130147,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: v_mov_b32_e32 v38, v51
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v35, v62
; GFX9-NEXT: v_mov_b32_e32 v36, v31
; GFX9-NEXT: v_mov_b32_e32 v40, v30
@@ -130689,7 +130596,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -130722,7 +130629,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -131579,7 +131486,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-TRUE16-NEXT: .LBB75_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -131612,7 +131519,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -131630,7 +131537,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -131663,7 +131570,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -132520,7 +132427,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-FAKE16-NEXT: .LBB75_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -132553,7 +132460,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -132588,22 +132495,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-LABEL: bitcast_v16f64_to_v64bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -132672,6 +132563,22 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; kill: killed $vgpr35
; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr63
@@ -132703,7 +132610,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; kill: killed $vgpr35
; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -132713,7 +132620,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
@@ -132843,7 +132750,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB76_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0
; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0
; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32
@@ -133081,10 +132987,25 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
@@ -133263,22 +133184,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -133966,8 +133871,22 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -134055,20 +133974,6 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
@@ -136071,7 +135976,10 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -136088,9 +135996,6 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -136656,7 +136561,7 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -139123,7 +139028,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
@@ -139156,7 +139061,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
@@ -139189,7 +139094,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
-; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
@@ -139991,7 +139896,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
@@ -140024,7 +139929,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
@@ -140057,7 +139962,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
@@ -140097,7 +140002,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
@@ -140130,7 +140035,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
@@ -140163,7 +140068,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
@@ -140855,7 +140760,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
@@ -140888,7 +140793,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
@@ -140921,7 +140826,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
@@ -140978,22 +140883,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) {
; SI-LABEL: bitcast_v16f64_to_v64f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -141062,6 +140951,22 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; kill: killed $vgpr35
; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr47
@@ -141093,7 +140998,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr35
; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -141144,7 +141049,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v8, v52
; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31
; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28
; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22
@@ -141314,7 +141218,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) {
; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0
; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0
; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8
; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9
@@ -141662,8 +141565,24 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v40
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -141712,22 +141631,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -142372,6 +142275,22 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v43
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v2, v14
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -142391,22 +142310,6 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB81_4:
@@ -144567,7 +144470,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -144600,7 +144503,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -144633,7 +144536,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -144711,7 +144614,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v185, off, s32
; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -144744,7 +144647,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -144777,7 +144680,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -144835,6 +144738,10 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) {
; SI-LABEL: bitcast_v16f64_to_v64i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -144851,10 +144758,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr63
@@ -144886,14 +144789,13 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; kill: killed $vgpr48
; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB84_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16
@@ -144937,7 +144839,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB84_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0
; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0
; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0
@@ -145149,6 +145050,22 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27
@@ -145175,7 +145092,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -145188,22 +145105,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -145607,7 +145508,23 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
@@ -145668,22 +145585,6 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB85_4:
@@ -146031,7 +145932,13 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
@@ -146063,12 +145970,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -146084,7 +145985,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
@@ -146301,7 +146201,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
@@ -147475,7 +147374,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -147508,7 +147407,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -147541,7 +147440,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -147619,7 +147518,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v185, off, s32
; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -147652,7 +147551,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -147685,7 +147584,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -147895,6 +147794,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264
; SI-NEXT: ; implicit-def: $vgpr11
; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: ; implicit-def: $vgpr9
@@ -147904,7 +147805,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140
@@ -147944,38 +147845,39 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29
; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200
@@ -147991,11 +147893,12 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v2
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240
-; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -148017,14 +147920,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v2
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316
@@ -148032,11 +147927,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:304
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296
-; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v1
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v2
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v3
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340
@@ -148045,9 +147944,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:328
-; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v1
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v2
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372
@@ -148057,7 +147958,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:368
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:360
-; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v3
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v1
@@ -149557,10 +149458,25 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -149857,22 +149773,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -149940,8 +149840,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5
; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7
; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9
@@ -150037,13 +149937,25 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -150171,14 +150083,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -150186,26 +150103,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -150214,35 +150111,57 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr40
; VI-NEXT: ; implicit-def: $vgpr41
; VI-NEXT: ; implicit-def: $vgpr48
; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr49
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -150275,39 +150194,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(7)
-; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(6)
-; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr59
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -150473,17 +150372,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr53
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -151168,8 +151059,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5
; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7
; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9
@@ -151280,13 +151171,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -151419,14 +151324,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -151434,26 +151344,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -151462,36 +151352,62 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b32 s6, 0x5040100
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: s_waitcnt vmcnt(14)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(10)
+; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; implicit-def: $vgpr55
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr38
; GFX9-NEXT: ; implicit-def: $vgpr35
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -151514,49 +151430,25 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
; GFX9-NEXT: ; implicit-def: $vgpr34
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(7)
-; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr60
-; GFX9-NEXT: ; implicit-def: $vgpr56
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr46
-; GFX9-NEXT: ; implicit-def: $vgpr44
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -151722,17 +151614,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr48
-; GFX9-NEXT: ; implicit-def: $vgpr53
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6
@@ -153078,7 +152962,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
@@ -153111,7 +152995,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
-; GFX11-FAKE16-NEXT: s_clause 0xf
+; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
@@ -153940,7 +153824,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100
; GFX11-FAKE16-NEXT: .LBB88_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392
; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396
; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400
@@ -153973,7 +153857,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512
; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0xf
+; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520
; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524
; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528
@@ -154018,7 +153902,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324
@@ -154029,7 +153912,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304
; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
; SI-NEXT: s_mov_b32 s72, s21
-; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v43, s19, 0
; SI-NEXT: v_writelane_b32 v43, s18, 1
; SI-NEXT: v_writelane_b32 v43, s17, 2
@@ -154070,10 +153953,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v41, s86, 30
; SI-NEXT: v_writelane_b32 v41, s87, 31
; SI-NEXT: v_writelane_b32 v41, s96, 32
+; SI-NEXT: s_mov_b32 s79, s26
; SI-NEXT: v_writelane_b32 v41, s97, 33
; SI-NEXT: v_writelane_b32 v41, s98, 34
; SI-NEXT: v_writelane_b32 v41, s99, 35
-; SI-NEXT: s_mov_b32 s79, s26
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152
; SI-NEXT: v_readfirstlane_b32 s38, v20
; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
; SI-NEXT: v_readfirstlane_b32 s39, v19
@@ -154100,9 +153989,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_readfirstlane_b32 s18, v5
; SI-NEXT: v_readfirstlane_b32 s19, v6
; SI-NEXT: v_readfirstlane_b32 s88, v4
-; SI-NEXT: v_readfirstlane_b32 s89, v3
-; SI-NEXT: v_readfirstlane_b32 s90, v9
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s6, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296
@@ -154110,33 +153997,31 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v32
; SI-NEXT: v_writelane_b32 v43, s4, 4
-; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272
; SI-NEXT: v_writelane_b32 v43, s4, 5
-; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v34
; SI-NEXT: v_writelane_b32 v43, s4, 6
-; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v35
; SI-NEXT: v_writelane_b32 v43, s4, 7
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v36
; SI-NEXT: v_writelane_b32 v43, s4, 8
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s4, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256
; SI-NEXT: v_writelane_b32 v43, s4, 9
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v38
; SI-NEXT: v_writelane_b32 v43, s4, 10
+; SI-NEXT: v_readfirstlane_b32 s89, v3
+; SI-NEXT: v_readfirstlane_b32 s90, v9
; SI-NEXT: v_readfirstlane_b32 s91, v10
; SI-NEXT: v_readfirstlane_b32 s92, v8
; SI-NEXT: v_readfirstlane_b32 s93, v7
@@ -154219,44 +154104,41 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_readfirstlane_b32 s24, v33
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s78, v34
+; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v35
; SI-NEXT: v_writelane_b32 v43, s4, 18
+; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s4, v36
; SI-NEXT: v_writelane_b32 v43, s4, 19
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s4, v37
; SI-NEXT: v_writelane_b32 v43, s4, 20
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: v_writelane_b32 v43, s4, 21
-; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s4, v38
; SI-NEXT: v_writelane_b32 v43, s4, 22
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s4, v39
; SI-NEXT: v_writelane_b32 v43, s4, 23
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_readfirstlane_b32 s4, v48
; SI-NEXT: v_writelane_b32 v43, s4, 24
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_readfirstlane_b32 s4, v49
; SI-NEXT: v_writelane_b32 v43, s4, 25
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_readfirstlane_b32 s4, v50
; SI-NEXT: v_writelane_b32 v43, s4, 26
-; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_readfirstlane_b32 s4, v51
; SI-NEXT: v_writelane_b32 v43, s4, 27
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136
@@ -154270,7 +154152,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104
; SI-NEXT: v_writelane_b32 v43, s4, 28
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_readfirstlane_b32 s4, v52
; SI-NEXT: v_writelane_b32 v43, s4, 29
; SI-NEXT: v_readfirstlane_b32 s4, v53
@@ -154279,7 +154161,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v43, s4, 31
; SI-NEXT: v_readfirstlane_b32 s4, v55
; SI-NEXT: v_writelane_b32 v43, s4, 32
-; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_readfirstlane_b32 s4, v40
; SI-NEXT: v_writelane_b32 v43, s4, 33
; SI-NEXT: v_writelane_b32 v43, s22, 34
@@ -155894,33 +155775,53 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24
; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124
+; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
+; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196
+; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204
+; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236
+; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244
+; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252
+; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
+; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308
+; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316
+; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20
; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22
-; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264
@@ -155965,52 +155866,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68
; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124
-; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
-; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180
-; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188
-; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196
-; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204
-; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236
-; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244
-; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252
-; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
-; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284
-; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308
-; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316
-; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
@@ -156030,6 +155885,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
@@ -156038,7 +155894,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
@@ -156070,6 +155925,25 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB89_2
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
@@ -156094,15 +155968,18 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v2, v8
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
@@ -156152,10 +156029,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -156163,50 +156041,37 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(9)
-; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v42, v43
; VI-NEXT: v_mov_b32_e32 v43, v37
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
@@ -156221,13 +156086,12 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -156249,21 +156113,28 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v54, v33
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v56, v1
; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v63, v39
+; VI-NEXT: v_mov_b32_e32 v54, v33
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v57, v0
; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -156281,11 +156152,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v53, v35
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, s4, v0
; VI-NEXT: s_and_b32 s4, s16, 0xff
@@ -156318,7 +156188,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: s_branch .LBB89_3
; VI-NEXT: .LBB89_2:
; VI-NEXT: v_mov_b32_e32 v47, v54
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
@@ -156339,6 +156208,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v58, v7
; VI-NEXT: v_mov_b32_e32 v57, v5
; VI-NEXT: v_mov_b32_e32 v56, v3
@@ -156930,29 +156800,51 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224
; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232
; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
+; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156
+; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188
+; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196
+; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204
+; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212
+; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252
+; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260
+; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268
+; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284
+; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292
+; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308
+; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324
; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: s_waitcnt vmcnt(29)
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7
; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
@@ -157016,82 +156908,42 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156
-; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172
-; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188
-; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196
-; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204
-; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212
-; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228
-; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252
-; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260
-; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268
-; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284
-; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292
-; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308
-; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(22)
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(24)
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(31)
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(35)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(35)
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
@@ -157112,6 +156964,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(55)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB89_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_and_b32 s4, s28, 0xff
@@ -157365,14 +157224,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
@@ -157382,7 +157240,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: .LBB89_2:
; GFX9-NEXT: v_mov_b32_e32 v58, v50
; GFX9-NEXT: v_mov_b32_e32 v45, v59
-; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
@@ -157394,6 +157251,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v34, v35
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v49, v39
@@ -157859,7 +157717,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64bf16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1e
+; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
@@ -158589,7 +158447,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l
; GFX11-TRUE16-NEXT: .LBB89_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1e
+; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
@@ -158631,7 +158489,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1e
+; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
@@ -159415,7 +159273,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
; GFX11-FAKE16-NEXT: .LBB89_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1e
+; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
@@ -161437,7 +161295,23 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xff, v10
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v58
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
@@ -161484,28 +161358,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v64bf16_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -161522,9 +161383,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: ; implicit-def: $vgpr35
; VI-NEXT: ; implicit-def: $vgpr45
; VI-NEXT: ; implicit-def: $vgpr34
@@ -161713,166 +161571,165 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v29
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v28
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16]
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v28
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14]
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v27
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v26
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12]
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v26
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v25
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10]
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8]
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v23
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6]
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4]
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v21
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2]
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v19
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32]
-; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18
; VI-NEXT: v_mov_b32_e32 v45, v46
; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v11
; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v31
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v46, v63
; VI-NEXT: v_mov_b32_e32 v63, v50
; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[27:28]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v10
; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v8
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5
; VI-NEXT: v_mov_b32_e32 v51, v57
; VI-NEXT: v_mov_b32_e32 v50, v56
; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5
; VI-NEXT: v_mov_b32_e32 v57, v43
; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[23:24]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4
; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[21:22]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2
; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[19:20]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1
; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[17:18]
; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1
; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12
; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12
; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11
@@ -161885,6 +161742,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6
; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6
; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32
; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32
; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32
@@ -162518,27 +162376,27 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22]
+; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18]
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v28
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20]
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v28
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v27
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v27
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18]
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v26
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v26
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
@@ -162923,9 +162781,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -163020,16 +162879,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37
-; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -163046,6 +162895,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37
+; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -163282,49 +163141,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11
-; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24
-; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23
-; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22
-; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22
-; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21
-; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21
-; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20
-; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19
-; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
@@ -163338,6 +163159,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v63
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(44)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v62
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v62
@@ -163355,130 +163177,168 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v6
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v7
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2
+; GFX9-NEXT: s_waitcnt vmcnt(35)
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16]
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(35)
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[13:14]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[11:12]
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[9:10]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[7:8]
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[5:6]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[3:4]
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[1:2]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[62:63]
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[29:30]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[27:28]
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[25:26]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[23:24]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[21:22]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[19:20]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[17:18]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
@@ -163571,16 +163431,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc
; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT: v_mov_b32_e32 v59, v32
; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21
-; GFX9-NEXT: v_mov_b32_e32 v58, v31
; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: v_perm_b32 v14, v13, v0, s7
; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc
@@ -163735,7 +163590,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: s_waitcnt vmcnt(52)
+; GFX9-NEXT: s_waitcnt vmcnt(50)
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v62
; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; GFX9-NEXT: v_cndmask_b32_e32 v44, v18, v19, vcc
@@ -163750,7 +163605,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
@@ -163891,8 +163745,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v9
; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX9-NEXT: v_mov_b32_e32 v59, v32
; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX9-NEXT: v_mov_b32_e32 v58, v31
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6
; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
@@ -163958,6 +163814,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: v_perm_b32 v61, v28, v0, s7
; GFX9-NEXT: v_cndmask_b32_e32 v0, v32, v41, vcc
; GFX9-NEXT: v_add3_u32 v31, v31, v13, s6
@@ -163965,7 +163822,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX9-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc
; GFX9-NEXT: v_perm_b32 v41, v13, v0, s7
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v16
; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1
@@ -163994,24 +163851,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v15
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; GFX9-NEXT: v_cndmask_b32_e32 v15, v31, v45, vcc
+; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7
; GFX9-NEXT: v_perm_b32 v31, v15, v26, s7
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v16
-; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26
; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14
-; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11
; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
@@ -164031,12 +163878,19 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GFX9-NEXT: v_perm_b32 v34, v30, v27, s7
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26
; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27
; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v25
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: v_perm_b32 v36, v44, v29, s7
; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v29
; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v24
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23
+; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11
; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v22
; GFX9-NEXT: v_perm_b32 v38, v21, v43, s7
@@ -164045,6 +163899,24 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v20
+; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; GFX9-NEXT: v_perm_b32 v51, v6, v17, s7
; GFX9-NEXT: v_perm_b32 v40, v10, v7, s7
; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7
@@ -164052,12 +163924,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v19
; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v57
-; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43
; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v47
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -164101,7 +163969,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v56
; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
@@ -164134,74 +164001,51 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[35:36]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34]
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61]
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63]
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44]
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59]
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59]
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v32
+; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34]
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v31
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v31
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v42
+; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61]
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v42
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v41
+; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v55
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v55
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54
+; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63]
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v40
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v39
; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v53
@@ -164214,15 +164058,26 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v37
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v36
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36
+; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44]
; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v35
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v35
; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v34
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33
+; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59]
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
@@ -164231,6 +164086,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v60
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v60
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
@@ -164255,31 +164113,33 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v49
; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v48
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v63, v16
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v35
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; GFX9-NEXT: v_mov_b32_e32 v63, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54
-; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40
-; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48
; GFX9-NEXT: v_mov_b32_e32 v62, v15
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v35
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v34
-; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v44
; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v44
-; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43
; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v43
+; GFX9-NEXT: s_waitcnt vmcnt(24)
+; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59]
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v59
; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v58
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(19)
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v60
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61
; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v61
; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v60
@@ -164294,6 +164154,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v54
; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32
; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -164302,6 +164166,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v39
; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v51
@@ -164310,38 +164176,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v52
; GFX9-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11
; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v12
-; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13
; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v14, 8, v14
-; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15
; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v43
; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v60
; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v31
; GFX9-NEXT: v_or_b32_sdwa v18, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -164516,13 +164373,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:76
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v55
-; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:76
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v55
+; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:80
@@ -164533,13 +164390,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:84
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57
-; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:84
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57
+; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:88
@@ -164679,7 +164536,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v128i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x2
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:240
@@ -164712,7 +164573,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1b
+; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:112
@@ -164741,10 +164602,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:12
-; GFX11-TRUE16-NEXT: s_clause 0x2
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr152_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
@@ -165778,7 +165635,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:20
@@ -165811,7 +165668,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:136
-; GFX11-TRUE16-NEXT: s_clause 0x1b
+; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:140
; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:144
; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:148
@@ -165846,7 +165703,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x15
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:96
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:92
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:88
@@ -165869,10 +165730,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -166991,7 +166848,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x15
+; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:20
@@ -168663,13 +168520,26 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_and_b32_e32 v1, 0xff, v46
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; SI-NEXT: s_lshl_b32 s5, s86, 24
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: v_readlane_b32 s86, v63, 30
; SI-NEXT: v_readlane_b32 s84, v63, 28
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -168708,20 +168578,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
@@ -169924,6 +169780,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_mov_b32_e32 v43, s4
; VI-NEXT: v_readlane_b32 s4, v62, 13
; VI-NEXT: v_mov_b32_e32 v46, s4
+; VI-NEXT: v_mov_b32_e32 v45, s72
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v45, s74
+; VI-NEXT: v_mov_b32_e32 v42, s54
+; VI-NEXT: v_mov_b32_e32 v41, s46
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s56
; VI-NEXT: v_readlane_b32 s4, v62, 14
; VI-NEXT: v_mov_b32_e32 v50, s4
; VI-NEXT: v_readlane_b32 s4, v62, 15
@@ -169949,6 +169814,11 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_readlane_b32 s4, v62, 22
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v45, s76
; VI-NEXT: v_readlane_b32 s4, v62, 23
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v55, s4
@@ -169994,6 +169864,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_readlane_b32 s4, v62, 37
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_readlane_b32 s4, v62, 38
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v55, s4
@@ -170052,45 +169924,47 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v55, s4
; VI-NEXT: v_readlane_b32 s4, v62, 57
-; VI-NEXT: v_mov_b32_e32 v42, s54
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_mov_b32_e32 v41, s46
+; VI-NEXT: v_mov_b32_e32 v36, s66
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v41, s56
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v41, s58
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v41, s60
-; VI-NEXT: v_mov_b32_e32 v45, s72
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v45, s74
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v45, s76
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v45, s78
; VI-NEXT: v_mov_b32_e32 v55, s88
+; VI-NEXT: v_mov_b32_e32 v35, s30
+; VI-NEXT: v_mov_b32_e32 v41, s58
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v36, s66
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v35, s85
+; VI-NEXT: v_mov_b32_e32 v34, s38
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v52, s64
-; VI-NEXT: v_mov_b32_e32 v55, v50
-; VI-NEXT: v_mov_b32_e32 v35, s30
; VI-NEXT: v_mov_b32_e32 v59, s87
+; VI-NEXT: v_mov_b32_e32 v41, s60
+; VI-NEXT: v_mov_b32_e32 v55, v50
; VI-NEXT: v_mov_b32_e32 v58, s34
; VI-NEXT: v_mov_b32_e32 v45, s36
-; VI-NEXT: v_mov_b32_e32 v34, s38
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v50, v46
+; VI-NEXT: v_mov_b32_e32 v46, v48
+; VI-NEXT: v_mov_b32_e32 v48, v47
+; VI-NEXT: v_mov_b32_e32 v47, v56
+; VI-NEXT: v_mov_b32_e32 v56, v51
+; VI-NEXT: v_mov_b32_e32 v51, s90
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v34, s48
; VI-NEXT: v_mov_b32_e32 v1, s44
; VI-NEXT: v_mov_b32_e32 v2, s45
; VI-NEXT: v_mov_b32_e32 v3, s42
@@ -170123,37 +169997,19 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_mov_b32_e32 v30, s29
; VI-NEXT: v_mov_b32_e32 v32, s5
; VI-NEXT: v_mov_b32_e32 v41, s62
+; VI-NEXT: v_mov_b32_e32 v51, v53
+; VI-NEXT: v_mov_b32_e32 v53, v54
+; VI-NEXT: v_mov_b32_e32 v54, v40
+; VI-NEXT: v_mov_b32_e32 v40, s80
; VI-NEXT: v_mov_b32_e32 v57, s81
; VI-NEXT: v_mov_b32_e32 v37, s84
+; VI-NEXT: v_mov_b32_e32 v58, s50
; VI-NEXT: v_mov_b32_e32 v60, s52
; VI-NEXT: v_mov_b32_e32 v38, s51
; VI-NEXT: v_mov_b32_e32 v61, s65
; VI-NEXT: v_mov_b32_e32 v49, s66
-; VI-NEXT: v_mov_b32_e32 v39, s55
-; VI-NEXT: v_mov_b32_e32 v50, v46
-; VI-NEXT: v_mov_b32_e32 v46, v48
-; VI-NEXT: v_mov_b32_e32 v48, v47
-; VI-NEXT: v_mov_b32_e32 v47, v56
-; VI-NEXT: v_mov_b32_e32 v56, v51
-; VI-NEXT: v_mov_b32_e32 v51, s90
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v35, s85
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v34, s48
-; VI-NEXT: v_mov_b32_e32 v51, v53
-; VI-NEXT: v_mov_b32_e32 v53, v54
-; VI-NEXT: v_mov_b32_e32 v54, v40
-; VI-NEXT: v_mov_b32_e32 v40, s80
-; VI-NEXT: v_mov_b32_e32 v58, s50
; VI-NEXT: v_mov_b32_e32 v45, s53
+; VI-NEXT: v_mov_b32_e32 v39, s55
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; VI-NEXT: .LBB91_5: ; %end
@@ -170462,9 +170318,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -170542,9 +170399,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50
; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -170562,20 +170433,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
@@ -172164,11 +172021,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v12
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6
; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -172183,6 +172036,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6
+; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
@@ -172194,7 +172051,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-TRUE16-NEXT: s_clause 0x3
+; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:8
@@ -173744,7 +173601,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-TRUE16-NEXT: s_clause 0x3
+; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:8
@@ -173757,7 +173614,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:8
@@ -175314,7 +175171,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32
; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:8
@@ -175488,9 +175345,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38
@@ -175508,6 +175362,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:192
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176
; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr37
@@ -175525,15 +175382,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208
@@ -175669,34 +175526,37 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
@@ -175716,7 +175576,10 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120
@@ -175726,7 +175589,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216
@@ -175752,14 +175617,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
@@ -175882,7 +175739,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v2, 0xff, v47
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v6, 0xff, v42
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
; SI-NEXT: v_and_b32_e32 v24, 0xff, v24
@@ -176540,25 +176396,18 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v20, 0xff, v20
; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16
; SI-NEXT: v_and_b32_e32 v16, 0xff, v16
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34
; SI-NEXT: v_and_b32_e32 v34, 0xff, v34
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30
; SI-NEXT: v_and_b32_e32 v30, 0xff, v30
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
; SI-NEXT: v_and_b32_e32 v28, 0xff, v28
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26
; SI-NEXT: v_and_b32_e32 v26, 0xff, v26
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24
; SI-NEXT: v_and_b32_e32 v24, 0xff, v24
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22
; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18
; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4
@@ -177265,9 +177114,24 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -177574,22 +177438,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -177657,8 +177505,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5
; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7
; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9
@@ -177754,13 +177602,25 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -177888,14 +177748,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -177903,26 +177768,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -177931,35 +177776,57 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr40
; VI-NEXT: ; implicit-def: $vgpr41
; VI-NEXT: ; implicit-def: $vgpr48
; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr49
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -177992,39 +177859,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(7)
-; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(6)
-; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr59
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -178190,17 +178037,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr53
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -178885,8 +178724,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5
; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7
; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9
@@ -178997,13 +178836,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -179136,14 +178989,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -179151,26 +179009,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -179179,36 +179017,62 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b32 s6, 0x5040100
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: s_waitcnt vmcnt(14)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(10)
+; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; implicit-def: $vgpr55
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr38
; GFX9-NEXT: ; implicit-def: $vgpr35
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -179231,49 +179095,25 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
; GFX9-NEXT: ; implicit-def: $vgpr34
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(7)
-; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr60
-; GFX9-NEXT: ; implicit-def: $vgpr56
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr46
-; GFX9-NEXT: ; implicit-def: $vgpr44
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -179439,17 +179279,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr48
-; GFX9-NEXT: ; implicit-def: $vgpr53
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6
@@ -180795,7 +180627,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
@@ -180828,7 +180660,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
-; GFX11-FAKE16-NEXT: s_clause 0xf
+; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
@@ -181657,7 +181489,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100
; GFX11-FAKE16-NEXT: .LBB92_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392
; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396
; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400
@@ -181690,7 +181522,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512
; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0xf
+; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520
; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524
; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528
@@ -183232,17 +183064,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v60
-; SI-NEXT: v_add_i32_e32 v5, vcc, 0x78, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v3
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
@@ -183256,6 +183077,17 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v5, vcc, 0x78, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v3
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
@@ -183515,33 +183347,53 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24
; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124
+; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
+; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196
+; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204
+; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236
+; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244
+; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252
+; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
+; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308
+; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316
+; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20
; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22
-; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264
@@ -183586,52 +183438,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68
; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124
-; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
-; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180
-; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188
-; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196
-; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204
-; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236
-; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244
-; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252
-; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
-; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284
-; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308
-; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316
-; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
@@ -183651,6 +183457,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
@@ -183659,7 +183466,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
@@ -183691,6 +183497,25 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB93_2
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
@@ -183715,15 +183540,18 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v2, v8
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
@@ -183773,10 +183601,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -183784,50 +183613,37 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(9)
-; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v42, v43
; VI-NEXT: v_mov_b32_e32 v43, v37
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
@@ -183842,13 +183658,12 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -183870,21 +183685,28 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v54, v33
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v56, v1
; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v63, v39
+; VI-NEXT: v_mov_b32_e32 v54, v33
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v57, v0
; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -183902,11 +183724,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v53, v35
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, s4, v0
; VI-NEXT: s_and_b32 s4, s16, 0xff
@@ -183939,7 +183760,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: s_branch .LBB93_3
; VI-NEXT: .LBB93_2:
; VI-NEXT: v_mov_b32_e32 v47, v54
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
@@ -183960,6 +183780,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v58, v7
; VI-NEXT: v_mov_b32_e32 v57, v5
; VI-NEXT: v_mov_b32_e32 v56, v3
@@ -184551,29 +184372,51 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224
; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232
; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
+; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156
+; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188
+; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196
+; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204
+; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212
+; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252
+; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260
+; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268
+; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284
+; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292
+; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308
+; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324
; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: s_waitcnt vmcnt(29)
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7
; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
@@ -184637,82 +184480,42 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156
-; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172
-; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188
-; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196
-; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204
-; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212
-; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228
-; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252
-; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260
-; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268
-; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284
-; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292
-; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308
-; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(22)
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(24)
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(31)
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(35)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(35)
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
@@ -184733,6 +184536,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(55)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB93_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_and_b32 s4, s28, 0xff
@@ -184986,14 +184796,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
@@ -185003,7 +184812,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: .LBB93_2:
; GFX9-NEXT: v_mov_b32_e32 v58, v50
; GFX9-NEXT: v_mov_b32_e32 v45, v59
-; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
@@ -185015,6 +184823,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v34, v35
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v49, v39
@@ -185480,7 +185289,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1e
+; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
@@ -186210,7 +186019,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l
; GFX11-TRUE16-NEXT: .LBB93_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1e
+; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
@@ -186252,7 +186061,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1e
+; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
@@ -187036,7 +186845,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
; GFX11-FAKE16-NEXT: .LBB93_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1e
+; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
@@ -189053,13 +188862,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v45
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -189076,6 +188878,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -189098,27 +188907,42 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v44, v12
; VI-NEXT: v_mov_b32_e32 v12, v0
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32
+; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v32, v20
; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v16
; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v43, v11
; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v10
-; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4
-; VI-NEXT: v_mov_b32_e32 v32, v20
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v55, v22
; VI-NEXT: v_mov_b32_e32 v54, v21
; VI-NEXT: v_mov_b32_e32 v31, v19
+; VI-NEXT: v_mov_b32_e32 v43, v11
; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v44
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43
; VI-NEXT: ; implicit-def: $vgpr20
; VI-NEXT: ; implicit-def: $vgpr57
; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: ; implicit-def: $vgpr8
+; VI-NEXT: ; implicit-def: $vgpr4
; VI-NEXT: ; implicit-def: $vgpr41
; VI-NEXT: ; implicit-def: $vgpr56
; VI-NEXT: ; implicit-def: $vgpr63
@@ -189130,47 +188954,38 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr42
; VI-NEXT: ; implicit-def: $vgpr45
; VI-NEXT: ; implicit-def: $vgpr52
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v32
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31
-; VI-NEXT: ; implicit-def: $vgpr8
-; VI-NEXT: ; implicit-def: $vgpr15
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13
-; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7
; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31
+; VI-NEXT: ; implicit-def: $vgpr15
; VI-NEXT: ; implicit-def: $vgpr13
; VI-NEXT: ; implicit-def: $vgpr9
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr5
-; VI-NEXT: ; implicit-def: $vgpr4
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr30
+; VI-NEXT: ; implicit-def: $vgpr34
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v29
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v28
@@ -189179,38 +188994,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr29
; VI-NEXT: ; implicit-def: $vgpr28
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; VI-NEXT: ; implicit-def: $vgpr27
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; VI-NEXT: ; implicit-def: $vgpr25
-; VI-NEXT: ; implicit-def: $vgpr24
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr23
-; VI-NEXT: ; implicit-def: $vgpr34
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v33
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr0
@@ -189254,8 +189037,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr0
; VI-NEXT: ; kill: killed $vgpr0
; VI-NEXT: ; implicit-def: $vgpr0
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; VI-NEXT: ; kill: killed $vgpr0
; VI-NEXT: ; implicit-def: $vgpr0
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr5
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; VI-NEXT: ; implicit-def: $vgpr27
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr25
+; VI-NEXT: ; implicit-def: $vgpr24
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr23
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr10
@@ -189293,28 +189102,49 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v56, v38
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v45, v7
-; VI-NEXT: v_mov_b32_e32 v63, v53
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v15, v3
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v28, v48
; VI-NEXT: v_mov_b32_e32 v48, v16
; VI-NEXT: v_mov_b32_e32 v16, v40
; VI-NEXT: v_mov_b32_e32 v47, v39
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v63, v53
+; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v32
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v44
; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v32
; VI-NEXT: v_lshrrev_b32_e32 v13, 24, v18
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v1
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v0
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v38
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v37
@@ -189326,83 +189156,20 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v62, v36
-; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v11
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v10
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v7
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v7
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53
-; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2
-; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27
-; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v62, v36
; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[37:38]
@@ -189417,61 +189184,94 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7]
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3]
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27]
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34]
-; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36
-; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36]
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33
+; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50
+; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49
+; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40
+; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11
+; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53
+; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53
+; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52
+; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27
+; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34
; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[52:53]
; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[58:59]
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
+; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27
; VI-NEXT: v_mov_b32_e32 v53, v63
-; VI-NEXT: v_mov_b32_e32 v27, v19
-; VI-NEXT: v_mov_b32_e32 v34, v14
-; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55
+; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40
+; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6
; VI-NEXT: v_mov_b32_e32 v7, v45
; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31
-; VI-NEXT: v_mov_b32_e32 v3, v15
-; VI-NEXT: v_mov_b32_e32 v15, v29
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; VI-NEXT: v_mov_b32_e32 v38, v56
-; VI-NEXT: v_mov_b32_e32 v29, v41
; VI-NEXT: v_mov_b32_e32 v45, v60
-; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55
; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49
+; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(13)
+; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59
+; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3]
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26
+; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27]
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36
+; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36
+; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35
; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v50
-; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50
-; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36]
; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[49:50]
; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[39:40]
; VI-NEXT: v_mov_b32_e32 v58, v51
+; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34]
; VI-NEXT: v_mov_b32_e32 v36, v62
; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[54:55]
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18]
-; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40
-; VI-NEXT: v_mov_b32_e32 v40, v16
-; VI-NEXT: v_mov_b32_e32 v16, v48
-; VI-NEXT: v_mov_b32_e32 v48, v28
-; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v27, v19
+; VI-NEXT: v_mov_b32_e32 v34, v14
; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v40, v16
+; VI-NEXT: v_mov_b32_e32 v16, v48
+; VI-NEXT: v_mov_b32_e32 v48, v28
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55
+; VI-NEXT: v_mov_b32_e32 v3, v15
+; VI-NEXT: v_mov_b32_e32 v15, v29
+; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v38, v56
; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v39
+; VI-NEXT: v_mov_b32_e32 v29, v41
; VI-NEXT: v_mov_b32_e32 v39, v47
; VI-NEXT: v_mov_b32_e32 v47, v4
; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v54
+; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55
; VI-NEXT: .LBB94_2: ; %Flow
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB94_4
; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v63, 0x200
; VI-NEXT: v_add_f16_sdwa v21, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21
@@ -189490,36 +189290,47 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_add_f16_e32 v31, 0x200, v31
; VI-NEXT: v_add_f16_sdwa v23, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_or_b32_e32 v14, v31, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23
; VI-NEXT: v_add_f16_e32 v55, 0x200, v55
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_or_b32_e32 v62, v55, v0
; VI-NEXT: v_add_f16_sdwa v0, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_add_f16_e32 v54, 0x200, v54
; VI-NEXT: v_or_b32_e32 v61, v54, v0
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v26, v54
; VI-NEXT: v_mov_b32_e32 v27, v55
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_add_f16_sdwa v60, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v60
; VI-NEXT: v_add_f16_e32 v25, 0x200, v25
; VI-NEXT: v_or_b32_e32 v34, v25, v0
; VI-NEXT: v_add_f16_sdwa v0, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v24, 0x200, v24
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v7, 0x200, v7
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v54, 0x200, v54
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v33, v24, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -189527,13 +189338,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v36, v2, v0
; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_e32 v35, v1, v0
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v25, 0x200, v25
+; VI-NEXT: v_add_f16_e32 v24, 0x200, v24
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
@@ -189542,38 +189361,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v38, v2, v0
; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_e32 v37, v1, v0
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v0, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v9, 0x200, v9
+; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v8, 0x200, v8
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_or_b32_e32 v49, v9, v0
-; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_add_f16_sdwa v47, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v3, 0x200, v3
+; VI-NEXT: v_or_b32_e32 v49, v9, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; VI-NEXT: v_add_f16_sdwa v1, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v48, v8, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v9, v31
; VI-NEXT: v_add_f16_sdwa v8, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v10, v32
@@ -189591,11 +189406,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v53, v2, v0
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_add_f16_sdwa v3, v44, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v44, 0x200, v44
; VI-NEXT: v_or_b32_e32 v52, v1, v0
@@ -189612,28 +189427,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v46, v2, v0
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; VI-NEXT: v_or_b32_e32 v45, v1, v0
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_f16_sdwa v1, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v7, 0x200, v7
; VI-NEXT: v_add_f16_e32 v6, 0x200, v6
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v5, v7, v0
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_e32 v4, v6, v0
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
+; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_add_f16_sdwa v39, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_f16_sdwa v56, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v7, 0x200, v7
; VI-NEXT: v_add_f16_e32 v6, 0x200, v6
@@ -189641,36 +189460,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v41, v7, v0
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39
-; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_e32 v40, v6, v0
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v25, 0x200, v25
-; VI-NEXT: v_add_f16_e32 v24, 0x200, v24
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42
; VI-NEXT: v_or_b32_e32 v7, v25, v0
; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
-; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v54, 0x200, v54
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v46
; VI-NEXT: v_or_b32_e32 v6, v24, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3
@@ -189679,7 +189475,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v31, v43, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28
; VI-NEXT: v_or_b32_e32 v30, v2, v0
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_add_f16_sdwa v2, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v55, 0x200, v55
; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
@@ -189695,8 +189490,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v1
; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v0
; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v30
@@ -189714,21 +189507,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7
+; VI-NEXT: v_mov_b32_e32 v32, v10
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7]
-; VI-NEXT: v_mov_b32_e32 v32, v10
; VI-NEXT: v_mov_b32_e32 v31, v9
; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v41
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7]
; VI-NEXT: v_mov_b32_e32 v7, v11
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[40:41]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v55, v27
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v54, v26
; VI-NEXT: v_mov_b32_e32 v26, v20
; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v5
@@ -189736,23 +189529,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_mov_b32_e32 v5, v22
; VI-NEXT: v_mov_b32_e32 v13, v21
; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[45:46]
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52
; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[50:51]
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v48
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[48:49]
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v36
@@ -189760,27 +189544,39 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[35:36]
; VI-NEXT: v_mov_b32_e32 v36, v2
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v15
-; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61
-; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v14
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49
; VI-NEXT: v_mov_b32_e32 v48, v56
; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v33
; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[33:34]
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[14:15]
; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v58
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61
+; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62]
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v57
-; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v9, v23
; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v40
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v14, v8
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v40, v42
; VI-NEXT: v_bfe_u32 v8, v42, 8, 8
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v38
; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v37
; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[37:38]
@@ -189797,26 +189593,24 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_bfe_u32 v51, v48, 8, 8
; VI-NEXT: v_bfe_u32 v57, v7, 8, 8
; VI-NEXT: v_bfe_u32 v58, v60, 8, 8
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_bfe_u32 v34, v62, 8, 8
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_bfe_u32 v2, v2, 8, 8
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; VI-NEXT: v_bfe_u32 v34, v47, 8, 8
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; VI-NEXT: v_bfe_u32 v9, v9, 8, 8
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_bfe_u32 v5, v5, 8, 8
; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; VI-NEXT: v_bfe_u32 v13, v13, 8, 8
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(12)
+; VI-NEXT: v_bfe_u32 v2, v2, 8, 8
+; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_bfe_u32 v42, v0, 8, 8
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: v_bfe_u32 v34, v62, 8, 8
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v34, v47, 8, 8
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_bfe_u32 v0, v0, 8, 8
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
@@ -189866,27 +189660,25 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v25
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v12
; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v12
; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v57
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -189898,9 +189690,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v23
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v12
@@ -189953,7 +189749,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v15
; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -189962,14 +189760,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v12
@@ -189991,11 +189786,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v12
@@ -190003,7 +189795,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28
; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -190053,7 +189847,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v61
; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x50, v12
@@ -190067,12 +189863,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v12
; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -190088,35 +189881,35 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v12
; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v21
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v12
; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v30
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v12
; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v12
@@ -190135,13 +189928,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -190161,12 +189951,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v12
; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -190176,15 +189964,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v12
; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34
-; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7c, v12
-; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -190201,28 +189981,20 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34
+; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7c, v12
+; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v64f16_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -190285,6 +190057,23 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
@@ -190315,7 +190104,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr52
; GFX9-NEXT: ; implicit-def: $vgpr51
; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -190349,7 +190137,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(33)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
@@ -190472,101 +190260,100 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(46)
+; GFX9-NEXT: s_waitcnt vmcnt(62)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
-; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
-; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20]
; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10
@@ -190582,6 +190369,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
@@ -190607,7 +190395,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: s_waitcnt vmcnt(18)
+; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
@@ -191148,17 +190936,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -191175,6 +190953,18 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -191633,7 +191423,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
@@ -191654,10 +191448,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -192293,7 +192083,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20
@@ -193940,13 +193730,27 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: v_mov_b32_e32 v4, s6
+; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v47
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: s_and_b32 s6, s7, 0xff
; SI-NEXT: s_lshl_b32 s7, s51, 8
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; SI-NEXT: s_or_b32 s6, s6, s7
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v47
; SI-NEXT: s_and_b32 s6, s6, 0xffff
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_or_b32_e32 v1, s6, v1
@@ -193979,21 +193783,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_or_b32_e32 v1, s4, v1
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: v_readlane_b32 s45, v62, 17
; SI-NEXT: v_readlane_b32 s43, v62, 23
; SI-NEXT: v_readlane_b32 s41, v62, 29
@@ -194001,6 +193790,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_readlane_b32 s27, v62, 41
; SI-NEXT: v_readlane_b32 s25, v62, 45
; SI-NEXT: v_readlane_b32 s9, v62, 49
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: v_readlane_b32 s99, v63, 35
; SI-NEXT: v_readlane_b32 s98, v63, 34
; SI-NEXT: v_readlane_b32 s97, v63, 33
@@ -194483,8 +194273,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v7
; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v10
@@ -194492,6 +194280,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v9
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v13
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v12
; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13]
@@ -194499,12 +194288,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v1
; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2]
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16
-; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4
@@ -194512,14 +194295,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4]
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10]
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16
; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16]
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19
; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v18
; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19]
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v15
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v35
; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[34:35]
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
@@ -194554,6 +194343,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_bfe_u32 v11, v52, 8, 8
; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v33
; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v32
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28
@@ -195163,9 +194953,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v27
; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -195174,13 +194966,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -195312,9 +195101,22 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -195382,20 +195184,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
@@ -195713,42 +195501,42 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22]
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22]
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26
@@ -196400,9 +196188,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
@@ -196434,10 +196224,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -196688,8 +196475,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24
; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -196704,6 +196489,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
@@ -196715,7 +196502,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
@@ -196750,7 +196537,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: v_writelane_b32 v76, s101, 5
; GFX11-NEXT: s_mov_b32 s99, 0
; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64
@@ -197669,7 +197456,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80
; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v74, off, s32
; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8
@@ -197731,7 +197518,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: v_readlane_b32 s31, v75, 1
; GFX11-NEXT: v_readlane_b32 s30, v75, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76
; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80
; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84
@@ -197782,11 +197569,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v54, v15
; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v15
; SI-NEXT: v_mov_b32_e32 v57, v5
; SI-NEXT: v_mov_b32_e32 v41, v3
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:392
@@ -197876,7 +197663,30 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160
+; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -197884,28 +197694,21 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32
; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10
+; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr23
; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: ; implicit-def: $vgpr27
@@ -197913,240 +197716,211 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr18
; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160
+; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34
-; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11
; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76
@@ -198158,15 +197932,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
@@ -198202,7 +197980,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48
@@ -198682,15 +198460,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v25, v6, v13
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v6, v6, v5
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
; SI-NEXT: v_alignbit_b32 v7, v25, v5, 16
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v26
; SI-NEXT: v_or_b32_e32 v6, v6, v11
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
; SI-NEXT: v_or_b32_e32 v5, v5, v8
; SI-NEXT: s_waitcnt expcnt(0)
@@ -199918,14 +199696,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
@@ -199942,6 +199714,12 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -200009,8 +199787,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5
; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7
; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9
@@ -200106,13 +199884,25 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -200240,14 +200030,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -200255,26 +200050,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -200283,35 +200058,57 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr40
; VI-NEXT: ; implicit-def: $vgpr41
; VI-NEXT: ; implicit-def: $vgpr48
; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr49
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -200344,39 +200141,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(7)
-; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(6)
-; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr59
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -200542,17 +200319,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr53
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -201237,8 +201006,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5
; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7
; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9
@@ -201349,13 +201118,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -201488,14 +201271,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -201503,26 +201291,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -201531,36 +201299,62 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b32 s6, 0x5040100
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: s_waitcnt vmcnt(14)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(10)
+; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; implicit-def: $vgpr55
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr38
; GFX9-NEXT: ; implicit-def: $vgpr35
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -201583,49 +201377,25 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
; GFX9-NEXT: ; implicit-def: $vgpr34
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(7)
-; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr60
-; GFX9-NEXT: ; implicit-def: $vgpr56
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr46
-; GFX9-NEXT: ; implicit-def: $vgpr44
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -201791,17 +201561,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr48
-; GFX9-NEXT: ; implicit-def: $vgpr53
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6
@@ -203147,7 +202909,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
@@ -203180,7 +202942,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
-; GFX11-FAKE16-NEXT: s_clause 0xf
+; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
@@ -204009,7 +203771,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100
; GFX11-FAKE16-NEXT: .LBB96_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392
; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396
; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400
@@ -204042,7 +203804,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512
; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0xf
+; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520
; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524
; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528
@@ -204087,7 +203849,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324
@@ -204097,9 +203858,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304
; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
-; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_writelane_b32 v41, s30, 0
-; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v43, s29, 0
; SI-NEXT: v_writelane_b32 v43, s28, 1
; SI-NEXT: v_writelane_b32 v43, s27, 2
@@ -204148,6 +203909,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_writelane_b32 v41, s96, 32
; SI-NEXT: v_writelane_b32 v41, s97, 33
; SI-NEXT: v_writelane_b32 v41, s98, 34
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152
; SI-NEXT: v_readfirstlane_b32 s39, v26
; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
; SI-NEXT: v_readfirstlane_b32 s47, v12
@@ -204170,9 +203937,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s59, v28
; SI-NEXT: v_readfirstlane_b32 s60, v27
; SI-NEXT: v_readfirstlane_b32 s11, v1
-; SI-NEXT: v_readfirstlane_b32 s12, v2
-; SI-NEXT: v_readfirstlane_b32 s13, v9
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: v_writelane_b32 v43, s4, 14
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300
@@ -204181,30 +203946,28 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v32
; SI-NEXT: v_writelane_b32 v43, s4, 15
-; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272
; SI-NEXT: v_writelane_b32 v43, s4, 16
-; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v34
; SI-NEXT: v_writelane_b32 v43, s4, 17
-; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v35
; SI-NEXT: v_writelane_b32 v43, s4, 18
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s44, v36
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s90, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s6, v38
+; SI-NEXT: v_readfirstlane_b32 s12, v2
+; SI-NEXT: v_readfirstlane_b32 s13, v9
; SI-NEXT: v_readfirstlane_b32 s14, v10
; SI-NEXT: v_readfirstlane_b32 s15, v8
; SI-NEXT: v_readfirstlane_b32 s18, v7
@@ -204218,6 +203981,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s77, v15
; SI-NEXT: v_readfirstlane_b32 s38, v25
; SI-NEXT: v_writelane_b32 v41, s99, 35
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_readfirstlane_b32 s93, v55
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_readfirstlane_b32 s95, v40
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: v_writelane_b32 v43, s4, 19
@@ -204294,39 +204061,35 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_writelane_b32 v43, s4, 30
; SI-NEXT: v_readfirstlane_b32 s4, v32
; SI-NEXT: v_writelane_b32 v43, s4, 31
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v34
; SI-NEXT: v_writelane_b32 v43, s4, 32
+; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s9, v35
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s4, v37
; SI-NEXT: v_writelane_b32 v43, s4, 33
; SI-NEXT: v_readfirstlane_b32 s10, v36
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: v_writelane_b32 v43, s4, 34
-; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s4, v38
; SI-NEXT: v_writelane_b32 v43, s4, 35
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s4, v39
; SI-NEXT: v_writelane_b32 v43, s4, 36
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_readfirstlane_b32 s69, v48
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_readfirstlane_b32 s30, v49
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_readfirstlane_b32 s16, v50
-; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_readfirstlane_b32 s36, v51
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136
@@ -204340,7 +204103,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104
; SI-NEXT: v_writelane_b32 v43, s4, 37
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_readfirstlane_b32 s4, v52
; SI-NEXT: v_writelane_b32 v43, s4, 38
; SI-NEXT: v_readfirstlane_b32 s4, v53
@@ -204367,9 +204130,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_writelane_b32 v43, s43, 58
; SI-NEXT: v_writelane_b32 v43, s76, 59
; SI-NEXT: v_writelane_b32 v43, s77, 60
-; SI-NEXT: v_readfirstlane_b32 s93, v55
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_readfirstlane_b32 s95, v40
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s17, v33
; SI-NEXT: s_waitcnt vmcnt(9)
@@ -205938,33 +205698,53 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24
; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124
+; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
+; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196
+; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204
+; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236
+; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244
+; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252
+; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
+; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308
+; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316
+; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20
; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22
-; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264
@@ -206009,52 +205789,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68
; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124
-; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
-; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180
-; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188
-; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196
-; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204
-; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236
-; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244
-; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252
-; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
-; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284
-; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308
-; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316
-; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
@@ -206074,6 +205808,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
@@ -206082,7 +205817,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
@@ -206114,6 +205848,25 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB97_2
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
@@ -206138,15 +205891,18 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v2, v8
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
@@ -206196,10 +205952,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -206207,50 +205964,37 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(9)
-; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v42, v43
; VI-NEXT: v_mov_b32_e32 v43, v37
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
@@ -206265,13 +206009,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -206293,21 +206036,28 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v54, v33
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v56, v1
; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v63, v39
+; VI-NEXT: v_mov_b32_e32 v54, v33
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v57, v0
; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -206325,11 +206075,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v53, v35
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, s4, v0
; VI-NEXT: s_and_b32 s4, s16, 0xff
@@ -206362,7 +206111,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_branch .LBB97_3
; VI-NEXT: .LBB97_2:
; VI-NEXT: v_mov_b32_e32 v47, v54
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
@@ -206383,6 +206131,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v58, v7
; VI-NEXT: v_mov_b32_e32 v57, v5
; VI-NEXT: v_mov_b32_e32 v56, v3
@@ -206974,29 +206723,51 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224
; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232
; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
+; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156
+; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188
+; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196
+; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204
+; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212
+; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252
+; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260
+; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268
+; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284
+; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292
+; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308
+; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324
; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: s_waitcnt vmcnt(29)
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7
; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
@@ -207060,82 +206831,42 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156
-; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172
-; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188
-; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196
-; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204
-; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212
-; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228
-; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252
-; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260
-; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268
-; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284
-; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292
-; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308
-; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(22)
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(24)
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(31)
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(35)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(35)
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
@@ -207156,6 +206887,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(55)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB97_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_and_b32 s4, s28, 0xff
@@ -207409,14 +207147,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
@@ -207426,7 +207163,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: .LBB97_2:
; GFX9-NEXT: v_mov_b32_e32 v58, v50
; GFX9-NEXT: v_mov_b32_e32 v45, v59
-; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
@@ -207438,6 +207174,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v34, v35
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v49, v39
@@ -207903,7 +207640,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1e
+; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
@@ -208633,7 +208370,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l
; GFX11-TRUE16-NEXT: .LBB97_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1e
+; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
@@ -208675,7 +208412,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1e
+; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
@@ -209459,7 +209196,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
; GFX11-FAKE16-NEXT: .LBB97_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1e
+; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
@@ -209562,100 +209299,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:132
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28
-; SI-NEXT: ; kill: killed $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6
@@ -209785,14 +209428,29 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr2
@@ -209809,13 +209467,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr2
@@ -209870,12 +209521,39 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; implicit-def: $vgpr18
@@ -209885,36 +209563,81 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112
; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
@@ -209936,6 +209659,18 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB98_2
@@ -211507,9 +211242,25 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_and_b32_e32 v3, 0xff, v20
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
@@ -211533,44 +211284,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v64i16_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32
@@ -211588,6 +211307,22 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9
; VI-NEXT: ; kill: killed $vgpr35
; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7
; VI-NEXT: ; kill: killed $vgpr35
@@ -211884,14 +211619,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v9, v8
; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v7, v5
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v7, v6
; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6]
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4
@@ -211923,10 +211656,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: v_mov_b32_e32 v3, v2
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37]
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v5, v4
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30
@@ -211997,10 +211726,16 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20]
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, v18
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v5, v4
; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22]
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, v46
; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18]
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26
; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24
; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22
@@ -212201,9 +211936,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v15
; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v13, v41, v13
@@ -212211,38 +211943,35 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13
; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12
; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11
; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10
; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9
; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7
; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5
; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6]
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3
; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
@@ -212255,8 +211984,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37]
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30
@@ -212325,6 +212052,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: v_mov_b32_e32 v49, v53
; VI-NEXT: v_mov_b32_e32 v53, v38
; VI-NEXT: v_mov_b32_e32 v38, v55
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v18
; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v17
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
@@ -212336,6 +212064,13 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: v_mov_b32_e32 v55, v31
; VI-NEXT: v_bfe_u32 v61, v53, 8, 8
; VI-NEXT: v_bfe_u32 v31, v38, 8, 8
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; VI-NEXT: .LBB98_4: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -212743,9 +212478,24 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
@@ -212768,44 +212518,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v64i16_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -212868,6 +212586,23 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
@@ -212898,7 +212633,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr52
; GFX9-NEXT: ; implicit-def: $vgpr51
; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -212932,7 +212666,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(33)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
@@ -213055,101 +212789,100 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(46)
+; GFX9-NEXT: s_waitcnt vmcnt(62)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
-; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
-; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20]
; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10
@@ -213165,6 +212898,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
@@ -213189,7 +212923,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: s_waitcnt vmcnt(18)
+; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
@@ -213730,17 +213464,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -213757,6 +213481,18 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -214215,7 +213951,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
@@ -214236,10 +213976,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -214875,7 +214611,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20
@@ -215014,26 +214750,26 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s91, v32
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_readfirstlane_b32 s93, v33
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_readfirstlane_b32 s55, v34
-; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_readfirstlane_b32 s17, v35
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_readfirstlane_b32 s95, v36
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_readfirstlane_b32 s35, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s83, v38
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80
@@ -215046,39 +214782,34 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s39, v1
; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s77, v31
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s38, v32
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s48, v33
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s50, v39
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s76, v48
+; SI-NEXT: v_readfirstlane_b32 s77, v31
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s30, v49
+; SI-NEXT: v_readfirstlane_b32 s38, v32
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s34, v50
+; SI-NEXT: v_readfirstlane_b32 s48, v33
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_readfirstlane_b32 s36, v51
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_readfirstlane_b32 s99, v34
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_readfirstlane_b32 s50, v39
; SI-NEXT: v_readfirstlane_b32 s90, v35
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_readfirstlane_b32 s92, v36
; SI-NEXT: v_writelane_b32 v41, s90, 11
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_readfirstlane_b32 s94, v37
; SI-NEXT: v_writelane_b32 v41, s92, 12
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_readfirstlane_b32 s30, v49
; SI-NEXT: v_writelane_b32 v41, s94, 13
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_readfirstlane_b32 s34, v50
; SI-NEXT: v_writelane_b32 v41, s30, 14
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_readfirstlane_b32 s36, v51
; SI-NEXT: v_writelane_b32 v41, s34, 15
; SI-NEXT: v_writelane_b32 v41, s36, 16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38
; SI-NEXT: v_writelane_b32 v41, s38, 17
+; SI-NEXT: v_readfirstlane_b32 s76, v48
+; SI-NEXT: v_readfirstlane_b32 s99, v34
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_writelane_b32 v41, s48, 18
; SI-NEXT: v_writelane_b32 v41, s50, 19
@@ -218060,48 +217791,48 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10]
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3
; GFX9-NEXT: v_pk_add_u16 v12, s41, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, s40, 3 op_sel_hi:[1,0]
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
-; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
-; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22]
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5
+; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22]
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26
@@ -218753,9 +218484,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
@@ -218787,10 +218520,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -219041,8 +218771,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24
; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -219057,6 +218785,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
@@ -219068,7 +218798,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
@@ -219103,7 +218833,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: v_writelane_b32 v76, s101, 5
; GFX11-NEXT: s_mov_b32 s99, 0
; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64
@@ -220022,7 +219752,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80
; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v74, off, s32
; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8
@@ -220084,7 +219814,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: v_readlane_b32 s31, v75, 1
; GFX11-NEXT: v_readlane_b32 s30, v75, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76
; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80
; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84
@@ -221102,9 +220832,24 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -221449,28 +221194,14 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v64bf16_to_v64f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -221487,9 +221218,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -221738,7 +221467,6 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31
; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
; VI-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -222104,6 +221832,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; GFX9-LABEL: bitcast_v64bf16_to_v64f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -222120,9 +221851,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -222341,7 +222070,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31
; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -222641,7 +222370,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -223201,7 +222930,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v42 :: v_dual_mov_b32 v11, v43
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v44 :: v_dual_mov_b32 v13, v45
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v46 :: v_dual_mov_b32 v15, v47
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -224930,10 +224659,26 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v50
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -224966,22 +224711,6 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -228962,7 +228691,6 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v47
-; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -228979,6 +228707,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -230321,20 +230050,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v37
; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
-; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v37
-; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26
-; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v37
-; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
@@ -230351,6 +230067,19 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v37
+; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26
+; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v37
+; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -231398,17 +231127,32 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16
; SI-NEXT: v_alignbit_b32 v2, v21, v2, 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16
; SI-NEXT: v_alignbit_b32 v1, v61, v1, 16
+; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
@@ -231418,57 +231162,63 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; SI-NEXT: v_alignbit_b32 v7, v24, v7, 16
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13
; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_alignbit_b32 v25, v45, v8, 16
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; SI-NEXT: v_alignbit_b32 v8, v25, v8, 16
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13
; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33
; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_alignbit_b32 v22, v34, v9, 16
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; SI-NEXT: v_alignbit_b32 v9, v22, v9, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11
@@ -231526,31 +231276,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16
; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: .LBB104_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -231808,7 +231535,23 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
@@ -231833,28 +231576,14 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v64bf16_to_v64i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -231871,9 +231600,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -232122,7 +231849,6 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31
; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
; VI-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -232488,6 +232214,9 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; GFX9-LABEL: bitcast_v64bf16_to_v64i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -232504,9 +232233,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -232725,7 +232452,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31
; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -234330,15 +234057,21 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v57, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34
-; SI-NEXT: v_mov_b32_e32 v57, v13
; SI-NEXT: v_mov_b32_e32 v40, v3
; SI-NEXT: v_mov_b32_e32 v54, v50
; SI-NEXT: v_mov_b32_e32 v46, v19
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9
; SI-NEXT: v_mov_b32_e32 v44, v15
; SI-NEXT: v_mov_b32_e32 v9, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v59
@@ -234372,32 +234105,24 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_mov_b32_e32 v42, v43
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(7) expcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(6) expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v50
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
+; SI-NEXT: v_mov_b32_e32 v5, v19
+; SI-NEXT: v_mov_b32_e32 v7, v15
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; SI-NEXT: v_mov_b32_e32 v5, v19
-; SI-NEXT: v_mov_b32_e32 v7, v15
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
@@ -234533,9 +234258,7 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v54, v50
-; SI-NEXT: v_mov_b32_e32 v56, v47
; SI-NEXT: v_mov_b32_e32 v9, v11
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v53, v5
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
@@ -234543,6 +234266,8 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v56, v47
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v40, v3
; SI-NEXT: v_mov_b32_e32 v44, v15
; SI-NEXT: v_mov_b32_e32 v57, v13
@@ -234850,16 +234575,18 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; SI-NEXT: v_lshr_b64 v[51:52], v[25:26], 16
; SI-NEXT: v_lshr_b64 v[52:53], v[1:2], 16
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; SI-NEXT: v_alignbit_b32 v16, v45, v16, 16
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
; SI-NEXT: v_alignbit_b32 v28, v58, v27, 16
@@ -234917,19 +234644,16 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16
; SI-NEXT: .LBB105_5: ; %end
-; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -234955,10 +234679,8 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
@@ -234985,12 +234707,11 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
@@ -235034,7 +234755,25 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
@@ -235049,10 +234788,10 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
@@ -235067,10 +234806,8 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
@@ -235078,17 +234815,18 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
@@ -235096,17 +234834,18 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
@@ -235185,22 +234924,6 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -238415,7 +238138,23 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
@@ -238580,22 +238319,6 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) {
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -240180,38 +239903,39 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v9
; SI-NEXT: v_cvt_f16_f32_e32 v43, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v55, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v40, v8
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v13
-; SI-NEXT: v_cvt_f16_f32_e32 v55, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v2
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v22
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v40, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v14
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v6, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v21
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v14
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v12, v18
; SI-NEXT: v_cvt_f16_f32_e32 v18, v19
; SI-NEXT: v_cvt_f16_f32_e32 v19, v23
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v23, v25
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v13
; SI-NEXT: v_cvt_f16_f32_e32 v44, v4
; SI-NEXT: v_cvt_f16_f32_e32 v52, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v15
; SI-NEXT: v_cvt_f16_f32_e32 v48, v16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v15
; SI-NEXT: v_cvt_f16_f32_e32 v4, v17
; SI-NEXT: v_cvt_f16_f32_e32 v13, v20
; SI-NEXT: v_cvt_f16_f32_e32 v20, v24
@@ -240222,7 +239946,6 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v31, v27
; SI-NEXT: v_cvt_f16_f32_e32 v25, v50
; SI-NEXT: v_cvt_f16_f32_e32 v27, v29
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v6, v42
; SI-NEXT: v_cvt_f16_f32_e32 v21, v47
; SI-NEXT: v_cvt_f16_f32_e32 v22, v38
@@ -240804,16 +240527,6 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -240830,6 +240543,16 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -241300,10 +241023,12 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v50
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
@@ -241315,7 +241040,24 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v3, v26
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v49
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v54
+; SI-NEXT: v_mov_b32_e32 v54, v15
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v12
+; SI-NEXT: v_mov_b32_e32 v12, v42
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
@@ -241325,8 +241067,13 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v26, v3, v5
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v3, v22
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v49
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
@@ -241335,39 +241082,22 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v22, v3, v5
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v3, v18
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v49
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v54
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v18, v3, v5
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v3, v16
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_mov_b32_e32 v54, v15
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v12
-; SI-NEXT: v_mov_b32_e32 v12, v42
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
@@ -241385,8 +241115,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v14, v3, v5
@@ -241430,11 +241158,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v49
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v50
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
@@ -241571,27 +241294,27 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_or_b32_e32 v12, v50, v1
; SI-NEXT: v_lshr_b64 v[49:50], v[35:36], 16
-; SI-NEXT: v_mov_b32_e32 v35, v44
-; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16
; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16
-; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16
; SI-NEXT: v_lshr_b64 v[20:21], v[42:43], 16
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[20:21], v[9:10], 16
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v35, v44
+; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16
+; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[20:21], v[40:41], 16
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16
; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16
; SI-NEXT: v_mov_b32_e32 v42, v61
; SI-NEXT: v_mov_b32_e32 v61, v37
@@ -241685,17 +241408,18 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
@@ -241727,7 +241451,9 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
@@ -241742,10 +241468,10 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
@@ -241758,25 +241484,24 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
@@ -241805,9 +241530,25 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -241825,22 +241566,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -242878,9 +242603,24 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -243225,22 +242965,6 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -244323,15 +244047,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v43
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
@@ -244348,6 +244065,13 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 9041f64..e688681 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -15670,8 +15670,25 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v3, 0xff, v29
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v3, v4, v3
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v40
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -15681,7 +15698,6 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_and_b32_e32 v3, 0xff, v55
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v40
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
@@ -15711,22 +15727,6 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -15947,16 +15947,16 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29
; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29
+; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -17964,14 +17964,6 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; VI-LABEL: bitcast_v40i8_to_v20i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v34, v10
; VI-NEXT: v_mov_b32_e32 v33, v8
; VI-NEXT: v_mov_b32_e32 v35, v6
@@ -17988,6 +17980,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v31, v14
; VI-NEXT: v_mov_b32_e32 v37, v12
; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1
@@ -18005,17 +18005,15 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29
-; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -18046,7 +18044,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -18101,14 +18099,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_mov_b32_e32 v1, 0x300
; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_add_u16_e32 v0, 3, v54
; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_add_u16_e32 v0, 3, v53
; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_add_u16_e32 v0, 3, v51
; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_e32 v0, 3, v30
@@ -21934,6 +21932,14 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
; SI-NEXT: v_bfe_u32 v29, v1, 8, 8
; SI-NEXT: .LBB60_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v24, 0xff, v24
; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32
; SI-NEXT: v_and_b32_e32 v30, 0xff, v30
@@ -22052,14 +22058,6 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -23918,18 +23916,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; SI-LABEL: bitcast_v40i8_to_v20f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v36, v4
; SI-NEXT: v_mov_b32_e32 v31, v2
; SI-NEXT: v_mov_b32_e32 v35, v0
@@ -23943,6 +23929,18 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1
; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3
; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5
@@ -23974,20 +23972,16 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v0
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v4
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v32
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v33
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v34
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr32
@@ -24027,7 +24021,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v6, 0xff, v30
; SI-NEXT: v_or_b32_e32 v6, v6, v47
; SI-NEXT: v_cvt_f32_f16_e32 v15, v6
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_and_b32_e32 v6, 0xff, v50
; SI-NEXT: v_or_b32_e32 v6, v6, v56
; SI-NEXT: v_cvt_f32_f16_e32 v32, v6
@@ -24105,18 +24099,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_or_b32_e32 v0, v59, v0
; SI-NEXT: v_add_i32_e32 v19, vcc, 0x300, v0
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: s_movk_i32 s6, 0x300
; SI-NEXT: v_or_b32_e32 v0, v58, v0
; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_or_b32_e32 v0, v57, v0
; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_or_b32_e32 v0, v56, v0
@@ -24232,14 +24225,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; VI-LABEL: bitcast_v40i8_to_v20f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v34, v10
; VI-NEXT: v_mov_b32_e32 v33, v8
; VI-NEXT: v_mov_b32_e32 v35, v6
@@ -24256,6 +24241,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v31, v14
; VI-NEXT: v_mov_b32_e32 v37, v12
; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1
@@ -24273,17 +24266,15 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29
-; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -24314,7 +24305,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -24369,14 +24360,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_mov_b32_e32 v1, 0x300
; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_add_u16_e32 v0, 3, v54
; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_add_u16_e32 v0, 3, v53
; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_add_u16_e32 v0, 3, v51
; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_e32 v0, 3, v30
@@ -28252,15 +28243,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; SI-LABEL: bitcast_v40i8_to_v5f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v36, v10
; SI-NEXT: v_mov_b32_e32 v35, v8
; SI-NEXT: v_mov_b32_e32 v34, v6
@@ -28277,6 +28259,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v38, v14
; SI-NEXT: v_mov_b32_e32 v37, v12
; SI-NEXT: s_waitcnt expcnt(0)
@@ -28295,17 +28286,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25
; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -28368,7 +28356,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v25, v8
; SI-NEXT: v_or_b32_e32 v7, v7, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_and_b32_e32 v8, 0xff, v50
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
; SI-NEXT: v_or_b32_e32 v8, v8, v23
@@ -28508,7 +28496,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v25, v8
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50
; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
@@ -28557,15 +28545,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; VI-LABEL: bitcast_v40i8_to_v5f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v36, v10
; VI-NEXT: v_mov_b32_e32 v35, v8
; VI-NEXT: v_mov_b32_e32 v34, v6
@@ -28582,6 +28561,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v38, v14
; VI-NEXT: v_mov_b32_e32 v37, v12
; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1
@@ -28599,17 +28587,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25
; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29
-; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -28640,7 +28625,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -28748,7 +28733,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_add_u16_e32 v7, 0x300, v7
; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v7, v7, v8
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_add_u16_e32 v8, 3, v50
; VI-NEXT: v_add_u16_e32 v10, 3, v49
; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -28780,15 +28765,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX9-LABEL: bitcast_v40i8_to_v5f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v36, v10
; GFX9-NEXT: v_mov_b32_e32 v35, v8
; GFX9-NEXT: v_mov_b32_e32 v34, v6
@@ -28805,6 +28781,16 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v38, v14
; GFX9-NEXT: v_mov_b32_e32 v37, v12
; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1
@@ -28822,17 +28808,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25
; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29
-; GFX9-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4
-; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: s_waitcnt vmcnt(14)
; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8
-; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: s_waitcnt vmcnt(12)
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -28863,7 +28849,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -28971,7 +28957,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7
; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v7, v7, v8
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_add_u16_e32 v8, 3, v50
; GFX9-NEXT: v_add_u16_e32 v9, 3, v49
; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -32301,15 +32287,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; SI-LABEL: bitcast_v40i8_to_v5i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v36, v10
; SI-NEXT: v_mov_b32_e32 v35, v8
; SI-NEXT: v_mov_b32_e32 v34, v6
@@ -32326,6 +32303,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v38, v14
; SI-NEXT: v_mov_b32_e32 v37, v12
; SI-NEXT: s_waitcnt expcnt(0)
@@ -32344,17 +32330,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25
; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -32417,7 +32400,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v25, v8
; SI-NEXT: v_or_b32_e32 v7, v7, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_and_b32_e32 v8, 0xff, v50
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
; SI-NEXT: v_or_b32_e32 v8, v8, v23
@@ -32557,7 +32540,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v25, v8
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50
; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
@@ -32606,15 +32589,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; VI-LABEL: bitcast_v40i8_to_v5i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v36, v10
; VI-NEXT: v_mov_b32_e32 v35, v8
; VI-NEXT: v_mov_b32_e32 v34, v6
@@ -32631,6 +32605,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v38, v14
; VI-NEXT: v_mov_b32_e32 v37, v12
; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1
@@ -32648,17 +32631,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25
; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29
-; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -32689,7 +32669,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -32797,7 +32777,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_add_u16_e32 v7, 0x300, v7
; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v7, v7, v8
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_add_u16_e32 v8, 3, v50
; VI-NEXT: v_add_u16_e32 v10, 3, v49
; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -32829,15 +32809,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX9-LABEL: bitcast_v40i8_to_v5i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v36, v10
; GFX9-NEXT: v_mov_b32_e32 v35, v8
; GFX9-NEXT: v_mov_b32_e32 v34, v6
@@ -32854,6 +32825,16 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v38, v14
; GFX9-NEXT: v_mov_b32_e32 v37, v12
; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1
@@ -32871,17 +32852,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25
; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29
-; GFX9-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4
-; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: s_waitcnt vmcnt(14)
; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8
-; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: s_waitcnt vmcnt(12)
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -32912,7 +32893,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -33020,7 +33001,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7
; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v7, v7, v8
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_add_u16_e32 v8, 3, v50
; GFX9-NEXT: v_add_u16_e32 v9, 3, v49
; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index ee23420..39da45b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -2406,13 +2406,13 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v32i16_to_v16i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v32, v2
; SI-NEXT: v_mov_b32_e32 v31, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v38, v14
; SI-NEXT: v_mov_b32_e32 v37, v12
; SI-NEXT: v_mov_b32_e32 v36, v10
@@ -2435,9 +2435,9 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -8424,6 +8424,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v14
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v33
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -8459,22 +8475,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -8757,6 +8757,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18
@@ -8790,22 +8806,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -9077,9 +9077,25 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -9106,22 +9122,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -11440,11 +11440,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92
@@ -11453,6 +11448,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -11484,7 +11484,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v11, 0xff, v52
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v43, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v12, 0xff, v58
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_or_b32_e32 v12, v54, v12
@@ -11723,7 +11722,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v43, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58
; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
@@ -11972,11 +11970,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120
; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128
+; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19
; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21
; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23
-; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
-; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29
; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17
; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
@@ -12016,16 +12014,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40
-; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
-; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41
-; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45
; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100
@@ -12035,6 +12026,13 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60
; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -12044,11 +12042,10 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -12211,7 +12208,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v15, 0x300
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_add_u16_e32 v9, 3, v40
; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -12221,7 +12218,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; VI-NEXT: v_add_u16_e32 v11, 3, v23
; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_add_u16_e32 v12, 3, v38
; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -12428,11 +12424,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120
; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128
+; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19
; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21
; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23
-; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
-; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29
; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17
; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
@@ -12476,16 +12472,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40
-; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
-; GFX9-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41
-; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45
; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100
@@ -12495,6 +12484,13 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68
; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60
; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -12504,11 +12500,10 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -12671,7 +12666,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GFX9-NEXT: s_movk_i32 s6, 0x300
-; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_add_u16_e32 v9, 3, v40
; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -12681,7 +12676,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: v_add_u16_e32 v11, 3, v23
; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_add_u16_e32 v12, 3, v38
; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -17323,13 +17317,13 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v32i16_to_v16f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v32, v2
; SI-NEXT: v_mov_b32_e32 v31, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v38, v14
; SI-NEXT: v_mov_b32_e32 v37, v12
; SI-NEXT: v_mov_b32_e32 v36, v10
@@ -17352,9 +17346,9 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -23315,6 +23309,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v14
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v33
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -23350,22 +23360,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -23648,6 +23642,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18
@@ -23681,22 +23691,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -23968,9 +23962,25 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -23997,22 +24007,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -25440,6 +25434,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: v_readlane_b32 s67, v63, 19
; VI-NEXT: v_readlane_b32 s66, v63, 18
; VI-NEXT: v_readlane_b32 s65, v63, 17
@@ -25460,7 +25469,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; VI-NEXT: v_readlane_b32 s34, v63, 2
; VI-NEXT: v_readlane_b32 s31, v63, 1
; VI-NEXT: v_readlane_b32 s30, v63, 0
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; VI-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -25490,21 +25499,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
@@ -25873,6 +25867,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s55, v63, 15
; GFX9-NEXT: v_readlane_b32 s54, v63, 14
; GFX9-NEXT: v_readlane_b32 s53, v63, 13
@@ -25889,7 +25898,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX9-NEXT: v_readlane_b32 s34, v63, 2
; GFX9-NEXT: v_readlane_b32 s31, v63, 1
; GFX9-NEXT: v_readlane_b32 s30, v63, 0
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -25915,21 +25924,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -26452,11 +26446,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92
@@ -26465,6 +26454,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -26496,7 +26490,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v11, 0xff, v52
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v43, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v12, 0xff, v58
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_or_b32_e32 v12, v54, v12
@@ -26735,7 +26728,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v43, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58
; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
@@ -26984,11 +26976,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120
; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128
+; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19
; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21
; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23
-; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
-; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29
; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17
; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
@@ -27028,16 +27020,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40
-; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
-; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41
-; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45
; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100
@@ -27047,6 +27032,13 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60
; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -27056,11 +27048,10 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -27223,7 +27214,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v15, 0x300
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_add_u16_e32 v9, 3, v40
; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -27233,7 +27224,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; VI-NEXT: v_add_u16_e32 v11, 3, v23
; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_add_u16_e32 v12, 3, v38
; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -27440,11 +27430,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120
; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128
+; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19
; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21
; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23
-; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
-; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29
; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17
; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
@@ -27488,16 +27478,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40
-; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
-; GFX9-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41
-; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45
; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100
@@ -27507,6 +27490,13 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68
; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60
; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -27516,11 +27506,10 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -27683,7 +27672,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GFX9-NEXT: s_movk_i32 s6, 0x300
-; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_add_u16_e32 v9, 3, v40
; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -27693,7 +27682,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: v_add_u16_e32 v11, 3, v23
; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_add_u16_e32 v12, 3, v38
; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -31688,13 +31676,13 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v32i16_to_v8i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v32, v2
; SI-NEXT: v_mov_b32_e32 v31, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v38, v14
; SI-NEXT: v_mov_b32_e32 v37, v12
; SI-NEXT: v_mov_b32_e32 v36, v10
@@ -31717,9 +31705,9 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -37714,6 +37702,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v14
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v33
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -37749,22 +37753,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -38047,6 +38035,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18
@@ -38080,22 +38084,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -38367,9 +38355,25 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -38396,22 +38400,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -40740,11 +40728,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92
@@ -40753,6 +40736,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -40784,7 +40772,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v11, 0xff, v52
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v43, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v12, 0xff, v58
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_or_b32_e32 v12, v54, v12
@@ -41023,7 +41010,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v43, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58
; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
@@ -41272,11 +41258,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120
; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128
+; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19
; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21
; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23
-; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
-; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29
; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17
; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
@@ -41316,16 +41302,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40
-; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
-; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41
-; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45
; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100
@@ -41335,6 +41314,13 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60
; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -41344,11 +41330,10 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -41511,7 +41496,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v15, 0x300
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_add_u16_e32 v9, 3, v40
; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -41521,7 +41506,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; VI-NEXT: v_add_u16_e32 v11, 3, v23
; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_add_u16_e32 v12, 3, v38
; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -41728,11 +41712,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120
; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128
+; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19
; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21
; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23
-; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
-; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29
; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17
; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
@@ -41776,16 +41760,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40
-; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
-; GFX9-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41
-; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45
; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100
@@ -41795,6 +41772,13 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68
; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60
; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -41804,11 +41788,10 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -41971,7 +41954,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GFX9-NEXT: s_movk_i32 s6, 0x300
-; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_add_u16_e32 v9, 3, v40
; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -41981,7 +41964,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: v_add_u16_e32 v11, 3, v23
; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_add_u16_e32 v12, 3, v38
; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -45317,13 +45299,13 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v32i16_to_v8f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v32, v2
; SI-NEXT: v_mov_b32_e32 v31, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v38, v14
; SI-NEXT: v_mov_b32_e32 v37, v12
; SI-NEXT: v_mov_b32_e32 v36, v10
@@ -45346,9 +45328,9 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -51165,6 +51147,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v14
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v33
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -51200,22 +51198,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -51490,6 +51472,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18
@@ -51523,22 +51521,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -51802,9 +51784,25 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -51831,22 +51829,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -54188,11 +54170,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92
@@ -54201,6 +54178,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -54232,7 +54214,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v11, 0xff, v52
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v43, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v12, 0xff, v58
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_or_b32_e32 v12, v54, v12
@@ -54471,7 +54452,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v43, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58
; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
@@ -54720,11 +54700,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120
; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128
+; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19
; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21
; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23
-; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
-; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29
; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17
; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
@@ -54764,16 +54744,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40
-; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
-; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41
-; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45
; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100
@@ -54783,6 +54756,13 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60
; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -54792,11 +54772,10 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -54959,7 +54938,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v15, 0x300
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_add_u16_e32 v9, 3, v40
; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -54969,7 +54948,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; VI-NEXT: v_add_u16_e32 v11, 3, v23
; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_add_u16_e32 v12, 3, v38
; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -55176,11 +55154,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120
; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128
+; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19
; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21
; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23
-; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
-; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29
; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17
; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
@@ -55224,16 +55202,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40
-; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
-; GFX9-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41
-; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45
; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100
@@ -55243,6 +55214,13 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68
; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60
; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -55252,11 +55230,10 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -55419,7 +55396,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GFX9-NEXT: s_movk_i32 s6, 0x300
-; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_add_u16_e32 v9, 3, v40
; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -55429,7 +55406,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: v_add_u16_e32 v11, 3, v23
; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_add_u16_e32 v12, 3, v38
; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -60580,6 +60556,8 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
; SI-LABEL: bitcast_v32bf16_to_v32i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -60596,8 +60574,6 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0
; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1
@@ -60661,9 +60637,8 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr28
; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -64326,18 +64301,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v1, 0xff, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v62
-; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -64354,6 +64317,18 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -64471,44 +64446,44 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v14
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v14
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16]
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v13
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v12
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v12
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14]
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v11
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v10
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v8
+; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v8
+; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v7
+; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v6
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
-; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
-; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
-; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v16
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5
; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v9
; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v4
; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v4
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3
@@ -64805,8 +64780,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -64823,6 +64796,8 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -65094,9 +65069,25 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -65123,22 +65114,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -67243,6 +67218,21 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s55, v63, 15
; GFX9-NEXT: v_readlane_b32 s54, v63, 14
; GFX9-NEXT: v_readlane_b32 s53, v63, 13
@@ -67259,7 +67249,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX9-NEXT: v_readlane_b32 s34, v63, 2
; GFX9-NEXT: v_readlane_b32 s31, v63, 1
; GFX9-NEXT: v_readlane_b32 s30, v63, 0
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -67285,21 +67275,6 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -67768,17 +67743,61 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v21
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v19
; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v27
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr19
; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr23
; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr27
; SI-NEXT: ; implicit-def: $vgpr55
@@ -67793,25 +67812,24 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v31
; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v32
; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v33
-; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v34
-; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35
-; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v37
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v38
; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v25
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v39
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v48
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50
; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr36
@@ -67819,7 +67837,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr25
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
@@ -67833,57 +67850,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12
; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v29
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr23
; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -67892,7 +67860,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v21, 0xff, v58
; SI-NEXT: v_or_b32_e32 v21, v21, v26
; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21
@@ -68173,7 +68140,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB98_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
@@ -68198,7 +68164,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v3, v59, v3
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_add_i32_e32 v55, vcc, s7, v1
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8
@@ -68222,7 +68187,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_i32_e32 v54, vcc, s7, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v57
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -68430,8 +68394,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: .LBB98_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
@@ -68448,6 +68410,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, v37
; SI-NEXT: v_mov_b32_e32 v2, v48
@@ -68458,7 +68422,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_mov_b32_e32 v12, v32
; SI-NEXT: v_mov_b32_e32 v14, v51
; SI-NEXT: v_mov_b32_e32 v16, v34
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mov_b32_e32 v18, v52
; SI-NEXT: v_mov_b32_e32 v20, v36
; SI-NEXT: v_mov_b32_e32 v22, v53
@@ -70196,13 +70159,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v46, v30
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24
-; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44
@@ -70219,6 +70181,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: v_readfirstlane_b32 s43, v1
; SI-NEXT: v_readfirstlane_b32 s42, v0
; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v3
@@ -70242,19 +70205,19 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36
; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v48
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v39
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v37
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v49
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v30
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v31
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v33
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v34
; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
@@ -70280,7 +70243,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v60, v44
; SI-NEXT: v_or_b32_e32 v44, v53, v9
; SI-NEXT: v_or_b32_e32 v33, v1, v44
@@ -70725,12 +70688,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: .LBB99_3: ; %end
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
@@ -70747,6 +70704,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_waitcnt expcnt(0)
@@ -70758,11 +70721,13 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_mov_b32_e32 v6, s5
; SI-NEXT: v_mov_b32_e32 v7, s11
; SI-NEXT: v_mov_b32_e32 v8, v37
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_mov_b32_e32 v10, v38
; SI-NEXT: v_mov_b32_e32 v12, v33
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_mov_b32_e32 v14, v34
; SI-NEXT: v_mov_b32_e32 v16, v48
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v18, v49
; SI-NEXT: v_mov_b32_e32 v20, v35
; SI-NEXT: v_mov_b32_e32 v22, v36
@@ -70770,7 +70735,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_mov_b32_e32 v26, v51
; SI-NEXT: v_mov_b32_e32 v28, v54
; SI-NEXT: v_mov_b32_e32 v30, v55
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB99_4:
; SI-NEXT: v_mov_b32_e32 v39, v32
@@ -72188,6 +72152,8 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v32f16_to_v32bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -72204,8 +72170,6 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v33, v1
; SI-NEXT: v_cvt_f16_f32_e32 v34, v2
@@ -72273,9 +72237,8 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr28
; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v63, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr31
@@ -76994,8 +76957,24 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xff, v9
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
@@ -77023,22 +77002,6 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -77351,7 +77314,23 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29
; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -77375,22 +77354,6 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -77663,9 +77626,25 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -77692,22 +77671,6 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -79163,13 +79126,12 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: ; implicit-def: $sgpr75
; VI-NEXT: s_branch .LBB105_2
; VI-NEXT: .LBB105_4:
-; VI-NEXT: v_mov_b32_e32 v1, s58
; VI-NEXT: v_mov_b32_e32 v53, s56
; VI-NEXT: v_mov_b32_e32 v52, s42
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v52, s44
+; VI-NEXT: v_mov_b32_e32 v1, s58
; VI-NEXT: v_mov_b32_e32 v19, s67
; VI-NEXT: v_mov_b32_e32 v12, s66
; VI-NEXT: v_mov_b32_e32 v20, s65
@@ -79215,6 +79177,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: v_mov_b32_e32 v45, s78
; VI-NEXT: v_mov_b32_e32 v42, s76
; VI-NEXT: v_mov_b32_e32 v55, s74
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v54, s57
; VI-NEXT: v_mov_b32_e32 v41, s59
; VI-NEXT: v_mov_b32_e32 v44, s60
@@ -79320,6 +79283,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v54
; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: v_readlane_b32 s67, v63, 19
; VI-NEXT: v_readlane_b32 s66, v63, 18
; VI-NEXT: v_readlane_b32 s65, v63, 17
@@ -79340,7 +79318,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: v_readlane_b32 s34, v63, 2
; VI-NEXT: v_readlane_b32 s31, v63, 1
; VI-NEXT: v_readlane_b32 s30, v63, 0
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -79372,21 +79350,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
@@ -79756,6 +79719,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s55, v63, 15
; GFX9-NEXT: v_readlane_b32 s54, v63, 14
; GFX9-NEXT: v_readlane_b32 s53, v63, 13
@@ -79772,7 +79750,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX9-NEXT: v_readlane_b32 s34, v63, 2
; GFX9-NEXT: v_readlane_b32 s31, v63, 1
; GFX9-NEXT: v_readlane_b32 s30, v63, 0
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -79798,21 +79776,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -80286,6 +80249,14 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9
@@ -80360,19 +80331,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v31
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v32
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v36
; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v37
@@ -80390,7 +80352,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v19, 0xff, v55
; SI-NEXT: v_or_b32_e32 v16, v19, v16
; SI-NEXT: v_cvt_f32_f16_e32 v34, v16
@@ -80403,7 +80365,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v12, 0xff, v18
; SI-NEXT: v_or_b32_e32 v10, v12, v10
; SI-NEXT: v_cvt_f32_f16_e32 v21, v10
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v10, 0xff, v41
; SI-NEXT: v_or_b32_e32 v8, v10, v8
; SI-NEXT: v_cvt_f32_f16_e32 v38, v8
@@ -80428,6 +80389,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v0, 0xff, v56
; SI-NEXT: v_or_b32_e32 v0, v0, v3
; SI-NEXT: v_cvt_f32_f16_e32 v29, v0
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v0, 0xff, v6
; SI-NEXT: v_or_b32_e32 v0, v0, v46
; SI-NEXT: v_cvt_f32_f16_e32 v54, v0
@@ -80634,13 +80596,12 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB106_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v56
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
; SI-NEXT: v_or_b32_e32 v7, v3, v7
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47
; SI-NEXT: v_or_b32_e32 v6, v46, v6
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
@@ -80648,12 +80609,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v9, v35, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v6
; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v7
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
; SI-NEXT: v_or_b32_e32 v7, v39, v7
; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v7
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v41
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
; SI-NEXT: v_or_b32_e32 v7, v8, v7
@@ -80852,13 +80811,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v31, v1
; SI-NEXT: .LBB106_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
@@ -80875,14 +80827,21 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v8, v33
; SI-NEXT: v_mov_b32_e32 v10, v37
; SI-NEXT: v_mov_b32_e32 v12, v49
; SI-NEXT: v_mov_b32_e32 v14, v53
; SI-NEXT: v_mov_b32_e32 v16, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mov_b32_e32 v18, v34
; SI-NEXT: v_mov_b32_e32 v20, v36
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mov_b32_e32 v22, v38
; SI-NEXT: v_mov_b32_e32 v24, v48
; SI-NEXT: v_mov_b32_e32 v26, v50
@@ -84461,22 +84420,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-LABEL: bitcast_v32bf16_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32
@@ -84542,6 +84485,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2
; SI-NEXT: v_mul_f32_e32 v31, 1.0, v4
; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3
@@ -84605,11 +84564,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr58
; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v29, 1.0, v50
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr50
@@ -85220,8 +85177,24 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
@@ -85249,22 +85222,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -85820,6 +85777,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18
@@ -85853,22 +85826,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -86400,21 +86357,10 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17
-; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54
-; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -86431,6 +86377,17 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54
+; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -88045,10 +88002,26 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v54
; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v42
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: s_lshl_b32 s4, s76, 8
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v42
; SI-NEXT: v_or_b32_e32 v3, s4, v3
; SI-NEXT: s_and_b32 s4, s74, 0xff
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
@@ -88076,22 +88049,6 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB109_4:
@@ -88805,6 +88762,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34
; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_readlane_b32 s67, v63, 19
@@ -88827,7 +88799,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_readlane_b32 s34, v63, 2
; VI-NEXT: v_readlane_b32 s31, v63, 1
; VI-NEXT: v_readlane_b32 s30, v63, 0
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -88857,21 +88829,6 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
@@ -90429,6 +90386,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124
; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v13
; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v21
; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v27
@@ -90458,28 +90417,30 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12
; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v17
-; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v20
-; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v24
-; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v28
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v31
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v32
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v33
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v34
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v36
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20
; SI-NEXT: s_waitcnt expcnt(0)
@@ -90496,8 +90457,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
@@ -90513,16 +90472,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v19
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v23
; SI-NEXT: ; kill: killed $vgpr3
@@ -90803,7 +90754,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB110_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v45
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16
@@ -90829,7 +90779,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v5, v58, v5
; SI-NEXT: v_or_b32_e32 v3, v5, v3
; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v3
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26
@@ -90841,7 +90790,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v5, v46, v5
; SI-NEXT: v_or_b32_e32 v3, v5, v3
; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v3
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v49
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v59
@@ -90854,7 +90802,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v5, v12, v5
; SI-NEXT: v_or_b32_e32 v3, v5, v3
; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20
@@ -90868,7 +90815,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v28
; SI-NEXT: v_or_b32_e32 v3, v5, v3
; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24
@@ -91086,11 +91032,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6
; SI-NEXT: .LBB110_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v2, v43
; SI-NEXT: v_mov_b32_e32 v10, v41
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_mov_b32_e32 v28, v40
; SI-NEXT: v_mov_b32_e32 v30, v42
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
@@ -91109,6 +91052,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v4, v33
; SI-NEXT: v_mov_b32_e32 v6, v39
; SI-NEXT: v_mov_b32_e32 v8, v51
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
index 5d4df4b..46911e7 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
@@ -4938,6 +4938,13 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v53
@@ -5037,13 +5044,6 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -6164,6 +6164,14 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v36f16_to_v18i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -6180,36 +6188,28 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: v_cvt_f16_f32_e32 v35, v1
; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
; SI-NEXT: v_cvt_f16_f32_e32 v62, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: v_cvt_f16_f32_e32 v60, v6
; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
; SI-NEXT: v_cvt_f16_f32_e32 v56, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: v_cvt_f16_f32_e32 v46, v12
; SI-NEXT: v_cvt_f16_f32_e32 v45, v15
; SI-NEXT: v_cvt_f16_f32_e32 v44, v14
@@ -6224,14 +6224,12 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v51, v25
; SI-NEXT: v_cvt_f16_f32_e32 v50, v24
; SI-NEXT: v_cvt_f16_f32_e32 v49, v27
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v36
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v39
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -12071,6 +12069,13 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v53
@@ -12170,13 +12175,6 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -13435,6 +13433,14 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v36f16_to_v18f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -13451,36 +13457,28 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: v_cvt_f16_f32_e32 v35, v1
; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
; SI-NEXT: v_cvt_f16_f32_e32 v62, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: v_cvt_f16_f32_e32 v60, v6
; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
; SI-NEXT: v_cvt_f16_f32_e32 v56, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: v_cvt_f16_f32_e32 v46, v12
; SI-NEXT: v_cvt_f16_f32_e32 v45, v15
; SI-NEXT: v_cvt_f16_f32_e32 v44, v14
@@ -13495,14 +13493,12 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v51, v25
; SI-NEXT: v_cvt_f16_f32_e32 v50, v24
; SI-NEXT: v_cvt_f16_f32_e32 v49, v27
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v36
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v39
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -18420,6 +18416,13 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v53
@@ -18519,13 +18522,6 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -19656,6 +19652,14 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v36f16_to_v9i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -19672,36 +19676,28 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: v_cvt_f16_f32_e32 v35, v1
; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
; SI-NEXT: v_cvt_f16_f32_e32 v62, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: v_cvt_f16_f32_e32 v60, v6
; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
; SI-NEXT: v_cvt_f16_f32_e32 v56, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: v_cvt_f16_f32_e32 v46, v12
; SI-NEXT: v_cvt_f16_f32_e32 v45, v15
; SI-NEXT: v_cvt_f16_f32_e32 v44, v14
@@ -19716,14 +19712,12 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v51, v25
; SI-NEXT: v_cvt_f16_f32_e32 v50, v24
; SI-NEXT: v_cvt_f16_f32_e32 v49, v27
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v36
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v39
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23978,6 +23972,13 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v53
@@ -24077,13 +24078,6 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -25282,6 +25276,14 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v36f16_to_v9f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -25298,36 +25300,28 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: v_cvt_f16_f32_e32 v35, v1
; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
; SI-NEXT: v_cvt_f16_f32_e32 v62, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: v_cvt_f16_f32_e32 v60, v6
; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
; SI-NEXT: v_cvt_f16_f32_e32 v56, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: v_cvt_f16_f32_e32 v46, v12
; SI-NEXT: v_cvt_f16_f32_e32 v45, v15
; SI-NEXT: v_cvt_f16_f32_e32 v44, v14
@@ -25342,14 +25336,12 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v51, v25
; SI-NEXT: v_cvt_f16_f32_e32 v50, v24
; SI-NEXT: v_cvt_f16_f32_e32 v49, v27
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v36
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v39
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -26798,22 +26790,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v36i16_to_v36f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12
@@ -26838,6 +26814,22 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; kill: killed $vgpr48
; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr63
@@ -26865,7 +26857,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; kill: killed $vgpr48
; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -26892,7 +26884,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v47, v9
; SI-NEXT: v_cvt_f32_f16_e32 v60, v10
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v39
; SI-NEXT: v_cvt_f32_f16_e32 v45, v11
; SI-NEXT: v_cvt_f32_f16_e32 v58, v12
@@ -26977,7 +26969,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v27
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39
; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
@@ -27147,8 +27138,24 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v52
; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -27203,22 +27210,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -27760,6 +27751,17 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v40
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v2, v51
; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -27842,17 +27844,6 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB57_4:
@@ -28709,6 +28700,12 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) {
; SI-NEXT: v_alignbit_b32 v9, v1, v9, 16
; SI-NEXT: .LBB58_2: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v20, 0xffff, v50
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55
; SI-NEXT: v_or_b32_e32 v20, v20, v21
@@ -28810,12 +28807,6 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -29421,9 +29412,15 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -29493,12 +29490,6 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB59_4:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index 44cfd6c..6749dab 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -3541,6 +3541,17 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v40i16_to_v20i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
+; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
@@ -3562,17 +3573,6 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
-; SI-NEXT: v_mov_b32_e32 v49, v12
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v37, v20
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_mov_b32_e32 v39, v16
@@ -3594,13 +3594,10 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
@@ -5557,10 +5554,23 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v40
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v55
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v40
; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -5656,19 +5666,6 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -11740,6 +11737,17 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v40i16_to_v20f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
+; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
@@ -11761,17 +11769,6 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
-; SI-NEXT: v_mov_b32_e32 v49, v12
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v37, v20
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_mov_b32_e32 v39, v16
@@ -11793,13 +11790,10 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
@@ -13756,10 +13750,23 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v40
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v55
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v40
; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -13855,19 +13862,6 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -14435,6 +14429,10 @@ define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v38, v38
; SI-NEXT: v_cvt_f32_f16_e32 v48, v48
; SI-NEXT: .LBB33_3: ; %end
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
@@ -14573,9 +14571,6 @@ define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB33_4:
@@ -19249,6 +19244,17 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v40i16_to_v10i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
+; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
@@ -19270,17 +19276,6 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
-; SI-NEXT: v_mov_b32_e32 v49, v12
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v37, v20
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_mov_b32_e32 v39, v16
@@ -19302,13 +19297,10 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
@@ -21265,10 +21257,23 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v40
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v55
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v40
; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -21364,19 +21369,6 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -25988,6 +25980,17 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v40i16_to_v10f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
+; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
@@ -26009,17 +26012,6 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
-; SI-NEXT: v_mov_b32_e32 v49, v12
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v37, v20
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_mov_b32_e32 v39, v16
@@ -26041,13 +26033,10 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
@@ -27984,10 +27973,23 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v40
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v55
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v40
; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -28083,19 +28085,6 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -28635,6 +28624,11 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a
; SI-NEXT: v_cvt_f32_f16_e32 v38, v38
; SI-NEXT: v_cvt_f32_f16_e32 v48, v48
; SI-NEXT: .LBB53_3: ; %end
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
@@ -28773,10 +28767,6 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB53_4:
@@ -31389,6 +31379,17 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v40i16_to_v40f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
@@ -31405,17 +31406,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; kill: killed $vgpr40
; SI-NEXT: ; implicit-def: $vgpr40
@@ -31472,7 +31462,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; kill: killed $vgpr40
; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -31523,7 +31513,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v30
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_cvt_f32_f16_e32 v40, v48
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr3
@@ -31623,7 +31612,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v27
; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -31643,7 +31631,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v38
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48
; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
@@ -31850,7 +31837,23 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v2, v40
; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -31888,22 +31891,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -32599,11 +32586,6 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v40
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v48
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -32620,6 +32602,11 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v48
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB57_4:
@@ -33574,8 +33561,20 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) {
; SI-NEXT: v_alignbit_b32 v19, v5, v29, 16
; SI-NEXT: .LBB58_2: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50
; SI-NEXT: v_or_b32_e32 v23, v23, v24
; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -33690,18 +33689,6 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -34417,6 +34404,18 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
@@ -34451,18 +34450,6 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB59_4:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index 87d5157..6b13e96 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -3792,6 +3792,17 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v44i16_to_v22i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v39, v16
+; SI-NEXT: v_mov_b32_e32 v48, v14
+; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
@@ -3814,17 +3825,6 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v39, v16
-; SI-NEXT: v_mov_b32_e32 v48, v14
-; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
@@ -3842,9 +3842,8 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
@@ -6118,8 +6117,24 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v26
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -6141,22 +6156,6 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -12755,6 +12754,17 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v44i16_to_v22f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v39, v16
+; SI-NEXT: v_mov_b32_e32 v48, v14
+; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
@@ -12777,17 +12787,6 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v39, v16
-; SI-NEXT: v_mov_b32_e32 v48, v14
-; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
@@ -12805,9 +12804,8 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
@@ -15081,8 +15079,24 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v26
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -15104,22 +15118,6 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -15744,6 +15742,15 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v50, v50
; SI-NEXT: v_cvt_f32_f16_e32 v52, v52
; SI-NEXT: .LBB33_3: ; %end
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
@@ -15896,14 +15903,6 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB33_4:
@@ -21004,6 +21003,17 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v44i16_to_v11i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v39, v16
+; SI-NEXT: v_mov_b32_e32 v48, v14
+; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
@@ -21026,17 +21036,6 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v39, v16
-; SI-NEXT: v_mov_b32_e32 v48, v14
-; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
@@ -21054,9 +21053,8 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
@@ -23330,8 +23328,24 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v26
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -23353,22 +23367,6 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -28420,6 +28418,17 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v44i16_to_v11f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v39, v16
+; SI-NEXT: v_mov_b32_e32 v48, v14
+; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
@@ -28442,17 +28451,6 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v39, v16
-; SI-NEXT: v_mov_b32_e32 v48, v14
-; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
@@ -28470,9 +28468,8 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
@@ -30724,8 +30721,24 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v26
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -30747,22 +30760,6 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -31355,6 +31352,16 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a
; SI-NEXT: v_cvt_f32_f16_e32 v50, v56
; SI-NEXT: v_cvt_f32_f16_e32 v52, v52
; SI-NEXT: .LBB53_3: ; %end
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
@@ -31507,15 +31514,6 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB53_4:
@@ -34944,7 +34942,23 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v2, v56
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -34982,22 +34996,6 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -35799,11 +35797,6 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v44
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v52
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
@@ -35820,6 +35813,11 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v52
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB57_4:
@@ -36879,9 +36877,19 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v26, v26, v27
; SI-NEXT: v_add_i32_e32 v27, vcc, 4, v0
; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v26, 0xffff, v50
-; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40
; SI-NEXT: v_or_b32_e32 v26, v26, v27
; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0
; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
@@ -36994,16 +37002,6 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -37793,6 +37791,22 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
@@ -37827,22 +37841,6 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB59_4:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index fb2e94f..034b802 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -4045,6 +4045,22 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v48i16_to_v24i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v48, v14
+; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@@ -4069,22 +4085,6 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v48, v14
-; SI-NEXT: v_mov_b32_e32 v49, v12
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
@@ -4100,21 +4100,14 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
@@ -6622,8 +6615,24 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v48
; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -6699,22 +6708,6 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -8179,6 +8172,8 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v48f16_to_v24i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@@ -8195,8 +8190,6 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
@@ -8223,34 +8216,34 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: v_cvt_f16_f32_e32 v52, v2
; SI-NEXT: v_cvt_f16_f32_e32 v51, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: v_cvt_f16_f32_e32 v49, v7
; SI-NEXT: v_cvt_f16_f32_e32 v48, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: v_cvt_f16_f32_e32 v38, v8
; SI-NEXT: v_cvt_f16_f32_e32 v37, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
; SI-NEXT: v_cvt_f16_f32_e32 v34, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
@@ -13043,6 +13036,9 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v19, v19, v20
; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0
; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen
@@ -13154,9 +13150,6 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB29_4:
@@ -13882,6 +13875,22 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v48i16_to_v24f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v48, v14
+; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@@ -13906,22 +13915,6 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v48, v14
-; SI-NEXT: v_mov_b32_e32 v49, v12
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
@@ -13937,21 +13930,14 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
@@ -16459,8 +16445,24 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v48
; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -16536,22 +16538,6 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -17233,13 +17219,27 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v55, v55
; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
; SI-NEXT: v_cvt_f16_f32_e32 v53, v53
+; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; SI-NEXT: v_or_b32_e32 v55, v55, v40
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40
; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT: v_or_b32_e32 v55, v55, v40
; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
@@ -17395,19 +17395,6 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB33_4:
@@ -18157,6 +18144,8 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v48f16_to_v24f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@@ -18173,8 +18162,6 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
@@ -18201,34 +18188,34 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: v_cvt_f16_f32_e32 v52, v2
; SI-NEXT: v_cvt_f16_f32_e32 v51, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: v_cvt_f16_f32_e32 v49, v7
; SI-NEXT: v_cvt_f16_f32_e32 v48, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: v_cvt_f16_f32_e32 v38, v8
; SI-NEXT: v_cvt_f16_f32_e32 v37, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
; SI-NEXT: v_cvt_f16_f32_e32 v34, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
@@ -22982,6 +22969,22 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v48i16_to_v12i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v48, v14
+; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@@ -23006,22 +23009,6 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v48, v14
-; SI-NEXT: v_mov_b32_e32 v49, v12
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
@@ -23037,21 +23024,14 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
@@ -25559,8 +25539,24 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v48
; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -25636,22 +25632,6 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -27128,6 +27108,8 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v48f16_to_v12i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@@ -27144,8 +27126,6 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
@@ -27172,34 +27152,34 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: v_cvt_f16_f32_e32 v52, v2
; SI-NEXT: v_cvt_f16_f32_e32 v51, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: v_cvt_f16_f32_e32 v49, v7
; SI-NEXT: v_cvt_f16_f32_e32 v48, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: v_cvt_f16_f32_e32 v38, v8
; SI-NEXT: v_cvt_f16_f32_e32 v37, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
; SI-NEXT: v_cvt_f16_f32_e32 v34, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
@@ -30384,6 +30364,9 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v19, v19, v20
; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0
; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen
@@ -30495,9 +30478,6 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB49_4:
@@ -31199,6 +31179,22 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v48i16_to_v12f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v48, v14
+; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@@ -31223,22 +31219,6 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v48, v14
-; SI-NEXT: v_mov_b32_e32 v49, v12
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
@@ -31254,21 +31234,14 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
@@ -33752,8 +33725,24 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v48
; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -33829,22 +33818,6 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -34491,13 +34464,28 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v55, v55
; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
; SI-NEXT: v_cvt_f16_f32_e32 v53, v53
+; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; SI-NEXT: v_or_b32_e32 v55, v55, v40
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40
; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT: v_or_b32_e32 v55, v55, v40
; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
@@ -34653,20 +34641,6 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB53_4:
@@ -35392,6 +35366,8 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v48f16_to_v12f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@@ -35408,8 +35384,6 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
@@ -35436,34 +35410,34 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: v_cvt_f16_f32_e32 v52, v2
; SI-NEXT: v_cvt_f16_f32_e32 v51, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: v_cvt_f16_f32_e32 v49, v7
; SI-NEXT: v_cvt_f16_f32_e32 v48, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: v_cvt_f16_f32_e32 v38, v8
; SI-NEXT: v_cvt_f16_f32_e32 v37, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
; SI-NEXT: v_cvt_f16_f32_e32 v34, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
@@ -38336,8 +38310,24 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -38524,22 +38514,6 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -39451,14 +39425,8 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v44
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
@@ -39475,6 +39443,12 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB57_4:
@@ -40754,6 +40728,23 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23
; SI-NEXT: v_or_b32_e32 v3, v3, v6
; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0
@@ -40767,7 +40758,6 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53
; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0
@@ -40775,22 +40765,6 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -41255,6 +41229,11 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; SI-LABEL: bitcast_v48f16_to_v48i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
@@ -41271,11 +41250,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_cvt_f16_f32_e32 v61, v2
; SI-NEXT: v_cvt_f16_f32_e32 v55, v3
@@ -41320,16 +41294,12 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v50, s25
; SI-NEXT: v_cvt_f16_f32_e32 v16, s26
; SI-NEXT: v_cvt_f16_f32_e32 v29, s29
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v32
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f16_f32_e32 v43, v33
; SI-NEXT: v_cvt_f16_f32_e32 v32, v20
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v25, v35
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v37
; SI-NEXT: v_cvt_f16_f32_e32 v20, s22
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
@@ -41694,9 +41664,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
@@ -41713,6 +41680,9 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB59_4:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index 07cdbef..8b6210d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -2741,9 +2741,14 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -2855,11 +2860,6 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -4341,6 +4341,19 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v52i16_to_v26i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
@@ -4366,19 +4379,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5
@@ -4394,17 +4394,12 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8
@@ -4429,9 +4424,10 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64
@@ -4443,10 +4439,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
@@ -5032,7 +5027,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v57, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -5099,6 +5093,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -5231,6 +5226,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB14_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -5245,9 +5243,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6
@@ -5266,6 +5261,10 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -5294,10 +5293,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -7263,11 +7258,6 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v56
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v51
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -7284,6 +7274,11 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v51
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -8051,29 +8046,34 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
; SI-NEXT: v_cvt_f16_f32_e32 v40, v40
; SI-NEXT: v_cvt_f16_f32_e32 v55, v55
-; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v53
-; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
; SI-NEXT: v_or_b32_e32 v43, v43, v44
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0
; SI-NEXT: v_or_b32_e32 v41, v41, v42
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0
; SI-NEXT: v_or_b32_e32 v55, v55, v40
+; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
+; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v53
+; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
+; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0
; SI-NEXT: v_or_b32_e32 v53, v53, v54
@@ -8225,11 +8225,6 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB17_4:
@@ -9760,7 +9755,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v57, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -9827,6 +9821,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -9959,6 +9954,9 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB18_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -9973,9 +9971,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
@@ -9995,6 +9990,10 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -10023,10 +10022,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -10295,14 +10290,28 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v39
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
@@ -10318,22 +10327,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB19_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
@@ -10342,8 +10335,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
@@ -10363,10 +10356,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36
@@ -10407,11 +10398,11 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v25, v38, v25
; SI-NEXT: s_cbranch_execnz .LBB19_3
; SI-NEXT: .LBB19_2: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v53
; SI-NEXT: v_cvt_f32_f16_e32 v9, v40
; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
@@ -10425,7 +10416,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v12, v47
; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
; SI-NEXT: v_cvt_f32_f16_e32 v15, v52
@@ -10463,7 +10453,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
@@ -13356,9 +13345,14 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -13470,11 +13464,6 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -14209,6 +14198,14 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -14272,14 +14269,6 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB29_4:
@@ -15076,6 +15065,19 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v52i16_to_v26f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
@@ -15101,19 +15103,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5
@@ -15129,17 +15118,12 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8
@@ -15164,9 +15148,10 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64
@@ -15178,10 +15163,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
@@ -15767,7 +15751,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v57, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -15834,6 +15817,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -15966,6 +15950,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB30_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -15980,9 +15967,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6
@@ -16001,6 +15985,10 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -16029,10 +16017,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -17998,11 +17982,6 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v56
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v51
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -18019,6 +17998,11 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v51
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -18938,14 +18922,9 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v45
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -18962,6 +18941,11 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB33_4:
@@ -20653,7 +20637,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v57, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -20720,6 +20703,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -20852,6 +20836,9 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB34_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -20866,9 +20853,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
@@ -20888,6 +20872,10 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -20916,10 +20904,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -21188,14 +21172,28 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v39
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
@@ -21211,22 +21209,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB35_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
@@ -21235,8 +21217,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
@@ -21256,10 +21238,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36
@@ -21300,11 +21280,11 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_or_b32_e32 v25, v38, v25
; SI-NEXT: s_cbranch_execnz .LBB35_3
; SI-NEXT: .LBB35_2: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v53
; SI-NEXT: v_cvt_f32_f16_e32 v9, v40
; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
@@ -21318,7 +21298,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v12, v47
; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
; SI-NEXT: v_cvt_f32_f16_e32 v15, v52
@@ -21356,7 +21335,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
@@ -23409,9 +23387,14 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -23523,11 +23506,6 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -25023,6 +25001,19 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v52i16_to_v13i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
@@ -25048,19 +25039,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5
@@ -25076,17 +25054,12 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8
@@ -25111,9 +25084,10 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64
@@ -25125,10 +25099,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
@@ -25714,7 +25687,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v57, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -25781,6 +25753,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -25913,6 +25886,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB42_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -25927,9 +25903,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6
@@ -25948,6 +25921,10 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -25976,10 +25953,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -27946,11 +27919,6 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v56
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v51
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -27967,6 +27935,11 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v51
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -28748,29 +28721,34 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
; SI-NEXT: v_cvt_f16_f32_e32 v40, v40
; SI-NEXT: v_cvt_f16_f32_e32 v55, v55
-; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v53
-; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
; SI-NEXT: v_or_b32_e32 v43, v43, v44
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0
; SI-NEXT: v_or_b32_e32 v41, v41, v42
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0
; SI-NEXT: v_or_b32_e32 v55, v55, v40
+; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
+; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v53
+; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
+; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0
; SI-NEXT: v_or_b32_e32 v53, v53, v54
@@ -28922,11 +28900,6 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB45_4:
@@ -30457,7 +30430,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v57, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -30524,6 +30496,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -30656,6 +30629,9 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB46_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -30670,9 +30646,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
@@ -30692,6 +30665,10 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -30720,10 +30697,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -30992,14 +30965,28 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v39
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
@@ -31015,22 +31002,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB47_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
@@ -31039,8 +31010,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
@@ -31060,10 +31031,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36
@@ -31104,11 +31073,11 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v25, v38, v25
; SI-NEXT: s_cbranch_execnz .LBB47_3
; SI-NEXT: .LBB47_2: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v53
; SI-NEXT: v_cvt_f32_f16_e32 v9, v40
; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
@@ -31122,7 +31091,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v12, v47
; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
; SI-NEXT: v_cvt_f32_f16_e32 v15, v52
@@ -31160,7 +31128,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
@@ -32398,9 +32365,14 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -32512,11 +32484,6 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -33212,6 +33179,14 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -33275,14 +33250,6 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB49_4:
@@ -34053,6 +34020,19 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v52i16_to_v13f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
@@ -34078,19 +34058,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5
@@ -34106,17 +34073,12 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8
@@ -34141,9 +34103,10 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64
@@ -34155,10 +34118,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
@@ -34744,7 +34706,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v57, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -34811,6 +34772,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -34943,6 +34905,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB50_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -34957,9 +34922,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6
@@ -34978,6 +34940,10 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -35006,10 +34972,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -36949,11 +36911,6 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v56
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v51
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -36970,6 +36927,11 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v51
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -37850,14 +37812,9 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v45
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -37874,6 +37831,11 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB53_4:
@@ -39539,7 +39501,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v57, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -39606,6 +39567,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -39738,6 +39700,9 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB54_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -39752,9 +39717,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
@@ -39774,6 +39736,10 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -39802,10 +39768,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -40074,14 +40036,28 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v39
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
@@ -40097,22 +40073,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB55_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
@@ -40121,8 +40081,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
@@ -40142,10 +40102,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36
@@ -40186,11 +40144,11 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_or_b32_e32 v25, v38, v25
; SI-NEXT: s_cbranch_execnz .LBB55_3
; SI-NEXT: .LBB55_2: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v53
; SI-NEXT: v_cvt_f32_f16_e32 v9, v40
; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
@@ -40204,7 +40162,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v12, v47
; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
; SI-NEXT: v_cvt_f32_f16_e32 v15, v52
@@ -40242,7 +40199,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
@@ -41870,8 +41826,24 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v51
; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -42102,22 +42074,6 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -43338,23 +43294,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
; SI-NEXT: v_cvt_f16_f32_e32 v2, v58
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -43371,6 +43311,22 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -44684,9 +44640,25 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62
; SI-NEXT: v_or_b32_e32 v1, v1, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
@@ -44714,22 +44686,6 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -45248,6 +45204,15 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; SI-LABEL: bitcast_v52f16_to_v52i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
@@ -45264,15 +45229,6 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: v_cvt_f16_f32_e32 v58, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v3
@@ -45317,26 +45273,19 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v41, s21
; SI-NEXT: v_cvt_f16_f32_e32 v16, s26
; SI-NEXT: v_cvt_f16_f32_e32 v54, s29
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_cvt_f16_f32_e32 v53, v32
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v33
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v30, v35
; SI-NEXT: v_cvt_f16_f32_e32 v35, v20
; SI-NEXT: v_cvt_f16_f32_e32 v33, v24
; SI-NEXT: v_cvt_f16_f32_e32 v31, v28
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cvt_f16_f32_e32 v55, v36
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cvt_f16_f32_e32 v4, v38
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f16_f32_e32 v27, v39
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v49
; SI-NEXT: v_cvt_f16_f32_e32 v24, s18
; SI-NEXT: v_cvt_f16_f32_e32 v20, s22
@@ -45729,9 +45678,25 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -45759,22 +45724,6 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB59_4:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 8eb71e9..09cf278 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -2928,9 +2928,18 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -3030,15 +3039,6 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -4665,6 +4665,11 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v56i16_to_v28i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
+; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -4694,11 +4699,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
-; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5
@@ -4715,9 +4715,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88
@@ -5413,7 +5412,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v59, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -5486,6 +5484,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -5634,6 +5633,9 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB14_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -5648,9 +5650,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
@@ -5669,6 +5668,10 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -5697,10 +5700,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -7830,21 +7829,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
; SI-NEXT: v_cvt_f16_f32_e32 v2, v45
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v29
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -7861,6 +7846,20 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v29
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -8743,6 +8742,15 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v53, vcc, 24, v0
@@ -8888,15 +8896,6 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB17_4:
@@ -10560,7 +10559,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v59, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -10633,6 +10631,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -10781,6 +10780,9 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB18_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -10795,9 +10797,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
@@ -10817,6 +10816,10 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -10845,10 +10848,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -11148,7 +11147,20 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -11156,7 +11168,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -11188,19 +11199,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v0, v43
; SI-NEXT: v_cvt_f16_f32_e32 v43, s17
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB19_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
@@ -11217,11 +11215,11 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_mov_b32_e32 v48, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_mov_b32_e32 v61, v44
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54
; SI-NEXT: v_mov_b32_e32 v39, v11
@@ -11299,6 +11297,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v27, v50, v27
; SI-NEXT: s_cbranch_execnz .LBB19_3
; SI-NEXT: .LBB19_2: ; %cmp.true
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v43
; SI-NEXT: v_cvt_f32_f16_e32 v2, v54
; SI-NEXT: v_cvt_f32_f16_e32 v1, v55
@@ -11317,7 +11316,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: v_cvt_f32_f16_e32 v2, v49
; SI-NEXT: v_cvt_f32_f16_e32 v3, v39
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
; SI-NEXT: v_cvt_f32_f16_e32 v8, v47
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
@@ -11585,7 +11583,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
@@ -14434,9 +14431,18 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -14536,15 +14542,6 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -15382,9 +15379,21 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -15412,18 +15421,6 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB29_4:
@@ -16290,6 +16287,11 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v56i16_to_v28f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
+; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -16319,11 +16321,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
-; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5
@@ -16340,9 +16337,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88
@@ -17038,7 +17034,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v59, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -17111,6 +17106,7 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -17259,6 +17255,9 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB30_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -17273,9 +17272,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
@@ -17294,6 +17290,10 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -17322,10 +17322,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -19455,21 +19451,7 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
; SI-NEXT: v_cvt_f16_f32_e32 v2, v45
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v29
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -19486,6 +19468,20 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v29
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -20473,28 +20469,12 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v43
; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v43
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v4, v57
-; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_cvt_f16_f32_e32 v2, v41
-; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v52
-; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v40
-; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v38
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -20511,6 +20491,22 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0
+; SI-NEXT: v_or_b32_e32 v3, v4, v3
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v52
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v38
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB33_4:
@@ -22343,7 +22339,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v59, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -22416,6 +22411,7 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -22564,6 +22560,9 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB34_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -22578,9 +22577,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
@@ -22600,6 +22596,10 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -22628,10 +22628,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -22931,7 +22927,20 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -22939,7 +22948,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -22971,19 +22979,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v0, v43
; SI-NEXT: v_cvt_f16_f32_e32 v43, s17
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB35_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
@@ -23000,11 +22995,11 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_mov_b32_e32 v48, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_mov_b32_e32 v61, v44
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54
; SI-NEXT: v_mov_b32_e32 v39, v11
@@ -23082,6 +23077,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_or_b32_e32 v27, v50, v27
; SI-NEXT: s_cbranch_execnz .LBB35_3
; SI-NEXT: .LBB35_2: ; %cmp.true
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v43
; SI-NEXT: v_cvt_f32_f16_e32 v2, v54
; SI-NEXT: v_cvt_f32_f16_e32 v1, v55
@@ -23100,7 +23096,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: v_cvt_f32_f16_e32 v2, v49
; SI-NEXT: v_cvt_f32_f16_e32 v3, v39
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
; SI-NEXT: v_cvt_f32_f16_e32 v8, v47
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
@@ -23368,7 +23363,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
@@ -25329,9 +25323,18 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -25431,15 +25434,6 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -27080,6 +27074,11 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v56i16_to_v14i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
+; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -27109,11 +27108,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
-; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5
@@ -27130,9 +27124,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88
@@ -27828,7 +27821,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v59, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -27901,6 +27893,7 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -28049,6 +28042,9 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB42_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -28063,9 +28059,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
@@ -28084,6 +28077,10 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -28112,10 +28109,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -30245,21 +30238,7 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
; SI-NEXT: v_cvt_f16_f32_e32 v2, v45
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v29
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -30276,6 +30255,20 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v29
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -31172,6 +31165,15 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v53, vcc, 24, v0
@@ -31317,15 +31319,6 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB45_4:
@@ -32989,7 +32982,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v59, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -33062,6 +33054,7 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -33210,6 +33203,9 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB46_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -33224,9 +33220,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
@@ -33246,6 +33239,10 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -33274,10 +33271,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -33577,7 +33570,20 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -33585,7 +33591,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -33617,19 +33622,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v0, v43
; SI-NEXT: v_cvt_f16_f32_e32 v43, s17
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB47_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
@@ -33646,11 +33638,11 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_mov_b32_e32 v48, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_mov_b32_e32 v61, v44
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54
; SI-NEXT: v_mov_b32_e32 v39, v11
@@ -33728,6 +33720,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v27, v50, v27
; SI-NEXT: s_cbranch_execnz .LBB47_3
; SI-NEXT: .LBB47_2: ; %cmp.true
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v43
; SI-NEXT: v_cvt_f32_f16_e32 v2, v54
; SI-NEXT: v_cvt_f32_f16_e32 v1, v55
@@ -33746,7 +33739,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: v_cvt_f32_f16_e32 v2, v49
; SI-NEXT: v_cvt_f32_f16_e32 v3, v39
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
; SI-NEXT: v_cvt_f32_f16_e32 v8, v47
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
@@ -34014,7 +34006,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
@@ -35112,9 +35103,18 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -35214,15 +35214,6 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -36018,9 +36009,21 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -36048,18 +36051,6 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB49_4:
@@ -36898,6 +36889,11 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v56i16_to_v14f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
+; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -36927,11 +36923,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
-; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5
@@ -36948,9 +36939,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88
@@ -37646,7 +37636,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v59, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -37719,6 +37708,7 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -37867,6 +37857,9 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB50_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -37881,9 +37874,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
@@ -37902,6 +37892,10 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -37930,10 +37924,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -40041,14 +40031,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -40065,6 +40048,13 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -41035,11 +41025,6 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v41
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v21
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -41056,6 +41041,11 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v21
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB53_4:
@@ -42860,7 +42850,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v59, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -42933,6 +42922,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -43081,6 +43071,9 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB54_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -43095,9 +43088,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
@@ -43117,6 +43107,10 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -43145,10 +43139,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -43448,7 +43438,20 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -43456,7 +43459,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -43488,19 +43490,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v0, v43
; SI-NEXT: v_cvt_f16_f32_e32 v43, s17
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB55_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
@@ -43517,11 +43506,11 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_mov_b32_e32 v48, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_mov_b32_e32 v61, v44
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54
; SI-NEXT: v_mov_b32_e32 v39, v11
@@ -43599,6 +43588,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_or_b32_e32 v27, v50, v27
; SI-NEXT: s_cbranch_execnz .LBB55_3
; SI-NEXT: .LBB55_2: ; %cmp.true
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v43
; SI-NEXT: v_cvt_f32_f16_e32 v2, v54
; SI-NEXT: v_cvt_f32_f16_e32 v1, v55
@@ -43617,7 +43607,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: v_cvt_f32_f16_e32 v2, v49
; SI-NEXT: v_cvt_f32_f16_e32 v3, v39
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
; SI-NEXT: v_cvt_f32_f16_e32 v8, v47
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
@@ -43885,7 +43874,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
@@ -45395,8 +45383,24 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v40
; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -45662,22 +45666,6 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -47041,6 +47029,22 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v49
; SI-NEXT: v_cvt_f16_f32_e32 v2, v34
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -47061,22 +47065,6 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -48513,6 +48501,22 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
@@ -48535,22 +48539,6 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -49691,6 +49679,22 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -49712,22 +49716,6 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB59_4:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index 93c11f1..4175d5f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -3108,9 +3108,22 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -3186,19 +3199,6 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -5032,40 +5032,53 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108
+; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48
@@ -5096,27 +5109,10 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -5201,7 +5197,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; kill: killed $vgpr30
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55
; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61
; SI-NEXT: ; kill: killed $vgpr30
@@ -5346,7 +5341,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55
; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -5494,7 +5488,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v60i16_to_v30i32:
@@ -5776,7 +5770,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v61, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
@@ -5855,6 +5848,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -6019,6 +6013,9 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB14_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -6033,9 +6030,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6
; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6
@@ -6054,6 +6048,10 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -6082,10 +6080,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -8387,11 +8381,6 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v42
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v32
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
@@ -8408,6 +8397,11 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v32
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -9393,11 +9387,24 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; SI-NEXT: v_add_i32_e32 v39, vcc, 44, v0
; SI-NEXT: v_or_b32_e32 v36, v38, v36
; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v37
-; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0
; SI-NEXT: v_or_b32_e32 v34, v36, v34
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen
@@ -9519,19 +9526,6 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB17_4:
@@ -10345,6 +10339,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32
@@ -10373,23 +10370,12 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
; SI-NEXT: v_cvt_f16_f32_e32 v59, v1
; SI-NEXT: v_cvt_f16_f32_e32 v57, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v7
@@ -10399,8 +10385,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v9
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v8
@@ -10422,9 +10406,18 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
@@ -10434,6 +10427,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
@@ -10471,7 +10465,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v61
; SI-NEXT: v_cvt_f16_f32_e32 v61, v49
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v49, v55
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -10486,6 +10479,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -11357,7 +11351,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v61, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
@@ -11436,6 +11429,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -11600,6 +11594,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB18_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -11614,9 +11611,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6
@@ -11636,6 +11630,10 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -11664,10 +11662,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -11988,12 +11982,35 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: v_cvt_f16_f32_e32 v50, v54
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_cvt_f16_f32_e32 v31, v40
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
@@ -12003,7 +12020,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v44
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -12012,7 +12029,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v46
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -12021,7 +12038,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v57
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v58
; SI-NEXT: v_cvt_f16_f32_e32 v58, s16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -12032,38 +12049,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB19_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v10, v3
-; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34
; SI-NEXT: v_mov_b32_e32 v33, v32
; SI-NEXT: v_or_b32_e32 v10, v32, v10
@@ -12088,12 +12079,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; SI-NEXT: v_or_b32_e32 v1, v12, v1
; SI-NEXT: v_or_b32_e32 v2, v11, v2
@@ -12202,12 +12193,10 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v40, v44
; SI-NEXT: s_cbranch_vccnz .LBB19_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: v_cvt_f32_f16_e32 v1, v58
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f32_f16_e32 v8, v33
@@ -15531,9 +15520,22 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -15609,19 +15611,6 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -16552,12 +16541,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -16573,6 +16557,11 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB29_4:
@@ -17570,40 +17559,53 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108
+; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48
@@ -17634,27 +17636,10 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -17739,7 +17724,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; kill: killed $vgpr30
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55
; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61
; SI-NEXT: ; kill: killed $vgpr30
@@ -17884,7 +17868,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55
; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -18032,7 +18015,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v60i16_to_v30f32:
@@ -18314,7 +18297,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v61, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
@@ -18393,6 +18375,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -18557,6 +18540,9 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB30_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -18571,9 +18557,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6
; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6
@@ -18592,6 +18575,10 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -18620,10 +18607,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -20925,11 +20908,6 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v42
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v32
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
@@ -20946,6 +20924,11 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v32
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -21861,7 +21844,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v4, v46
; SI-NEXT: v_add_i32_e32 v6, vcc, 12, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_or_b32_e32 v4, v5, v4
; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
@@ -22028,24 +22011,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v4, v45
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
-; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v38
-; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v59
-; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v14
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -22062,6 +22028,23 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v54
+; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, v4, v3
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v38
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v14
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB33_4:
@@ -23044,6 +23027,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32
@@ -23072,23 +23058,12 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
; SI-NEXT: v_cvt_f16_f32_e32 v59, v1
; SI-NEXT: v_cvt_f16_f32_e32 v57, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v7
@@ -23098,8 +23073,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v9
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v8
@@ -23121,9 +23094,18 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
@@ -23133,6 +23115,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
@@ -23170,7 +23153,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v61
; SI-NEXT: v_cvt_f16_f32_e32 v61, v49
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v49, v55
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23185,6 +23167,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -24056,7 +24039,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v61, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
@@ -24135,6 +24117,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -24299,6 +24282,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB34_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -24313,9 +24299,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6
@@ -24335,6 +24318,10 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -24363,10 +24350,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -24687,12 +24670,35 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: v_cvt_f16_f32_e32 v50, v54
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_cvt_f16_f32_e32 v31, v40
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
@@ -24702,7 +24708,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v44
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -24711,7 +24717,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v46
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -24720,7 +24726,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v57
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v58
; SI-NEXT: v_cvt_f16_f32_e32 v58, s16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -24731,38 +24737,12 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB35_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v10, v3
-; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34
; SI-NEXT: v_mov_b32_e32 v33, v32
; SI-NEXT: v_or_b32_e32 v10, v32, v10
@@ -24787,12 +24767,12 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; SI-NEXT: v_or_b32_e32 v1, v12, v1
; SI-NEXT: v_or_b32_e32 v2, v11, v2
@@ -24901,12 +24881,10 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: v_mov_b32_e32 v40, v44
; SI-NEXT: s_cbranch_vccnz .LBB35_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: v_cvt_f32_f16_e32 v1, v58
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f32_f16_e32 v8, v33
@@ -27300,9 +27278,22 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -27378,19 +27369,6 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -29240,40 +29218,53 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108
+; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48
@@ -29304,27 +29295,10 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -29409,7 +29383,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; kill: killed $vgpr30
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55
; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61
; SI-NEXT: ; kill: killed $vgpr30
@@ -29554,7 +29527,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55
; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -29702,7 +29674,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v60i16_to_v15i64:
@@ -29984,7 +29956,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v61, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
@@ -30063,6 +30034,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -30227,6 +30199,9 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB42_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -30241,9 +30216,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6
; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6
@@ -30262,6 +30234,10 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -30290,10 +30266,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -32596,11 +32568,6 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v42
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v32
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
@@ -32617,6 +32584,11 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v32
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -33618,11 +33590,24 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; SI-NEXT: v_add_i32_e32 v39, vcc, 44, v0
; SI-NEXT: v_or_b32_e32 v36, v38, v36
; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v37
-; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0
; SI-NEXT: v_or_b32_e32 v34, v36, v34
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen
@@ -33744,19 +33729,6 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB45_4:
@@ -34570,6 +34542,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32
@@ -34598,23 +34573,12 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
; SI-NEXT: v_cvt_f16_f32_e32 v59, v1
; SI-NEXT: v_cvt_f16_f32_e32 v57, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v7
@@ -34624,8 +34588,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v9
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v8
@@ -34647,9 +34609,18 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
@@ -34659,6 +34630,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
@@ -34696,7 +34668,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v61
; SI-NEXT: v_cvt_f16_f32_e32 v61, v49
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v49, v55
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -34711,6 +34682,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -35582,7 +35554,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v61, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
@@ -35661,6 +35632,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -35825,6 +35797,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB46_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -35839,9 +35814,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6
@@ -35861,6 +35833,10 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -35889,10 +35865,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -36213,12 +36185,35 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: v_cvt_f16_f32_e32 v50, v54
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_cvt_f16_f32_e32 v31, v40
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
@@ -36228,7 +36223,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v44
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -36237,7 +36232,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v46
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -36246,7 +36241,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v57
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v58
; SI-NEXT: v_cvt_f16_f32_e32 v58, s16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -36257,38 +36252,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB47_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v10, v3
-; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34
; SI-NEXT: v_mov_b32_e32 v33, v32
; SI-NEXT: v_or_b32_e32 v10, v32, v10
@@ -36313,12 +36282,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; SI-NEXT: v_or_b32_e32 v1, v12, v1
; SI-NEXT: v_or_b32_e32 v2, v11, v2
@@ -36427,12 +36396,10 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v40, v44
; SI-NEXT: s_cbranch_vccnz .LBB47_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: v_cvt_f32_f16_e32 v1, v58
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f32_f16_e32 v8, v33
@@ -37922,9 +37889,22 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -38000,19 +37980,6 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -38899,12 +38866,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -38921,6 +38883,11 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB49_4:
@@ -39888,40 +39855,53 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108
+; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48
@@ -39952,27 +39932,10 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -40057,7 +40020,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; kill: killed $vgpr30
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55
; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61
; SI-NEXT: ; kill: killed $vgpr30
@@ -40202,7 +40164,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55
; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -40350,7 +40311,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v60i16_to_v15f64:
@@ -40632,7 +40593,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v61, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
@@ -40711,6 +40671,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -40875,6 +40836,9 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB50_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -40889,9 +40853,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6
; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6
@@ -40910,6 +40871,10 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -40938,10 +40903,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -43173,8 +43134,24 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v62
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -43218,22 +43195,6 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -44289,11 +44250,6 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v40
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v10
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -44310,6 +44266,11 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v10
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB53_4:
@@ -45262,6 +45223,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32
@@ -45290,23 +45254,12 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
; SI-NEXT: v_cvt_f16_f32_e32 v59, v1
; SI-NEXT: v_cvt_f16_f32_e32 v57, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v7
@@ -45316,8 +45269,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v9
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v8
@@ -45339,9 +45290,18 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
@@ -45351,6 +45311,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
@@ -45388,7 +45349,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v61
; SI-NEXT: v_cvt_f16_f32_e32 v61, v49
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v49, v55
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -45403,6 +45363,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -46274,7 +46235,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v61, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
@@ -46353,6 +46313,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -46517,6 +46478,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB54_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -46531,9 +46495,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6
@@ -46553,6 +46514,10 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -46581,10 +46546,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -46905,12 +46866,35 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: v_cvt_f16_f32_e32 v50, v54
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_cvt_f16_f32_e32 v31, v40
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
@@ -46920,7 +46904,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v44
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -46929,7 +46913,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v46
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -46938,7 +46922,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v57
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v58
; SI-NEXT: v_cvt_f16_f32_e32 v58, s16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -46949,38 +46933,12 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB55_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v10, v3
-; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34
; SI-NEXT: v_mov_b32_e32 v33, v32
; SI-NEXT: v_or_b32_e32 v10, v32, v10
@@ -47005,12 +46963,12 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; SI-NEXT: v_or_b32_e32 v1, v12, v1
; SI-NEXT: v_or_b32_e32 v2, v11, v2
@@ -47119,12 +47077,10 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: v_mov_b32_e32 v40, v44
; SI-NEXT: s_cbranch_vccnz .LBB55_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: v_cvt_f32_f16_e32 v1, v58
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f32_f16_e32 v8, v33
@@ -49301,7 +49257,23 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v2, v47
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -49339,22 +49311,6 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -50856,6 +50812,22 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v55
; SI-NEXT: v_cvt_f16_f32_e32 v2, v50
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -50876,22 +50848,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -51893,27 +51849,27 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v55, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v40, v4
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v18
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v40, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v6
; SI-NEXT: v_cvt_f16_f32_e32 v53, v8
; SI-NEXT: v_cvt_f16_f32_e32 v8, v10
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v6
; SI-NEXT: v_cvt_f16_f32_e32 v49, v12
; SI-NEXT: v_cvt_f16_f32_e32 v6, v13
; SI-NEXT: v_cvt_f16_f32_e32 v37, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v18
; SI-NEXT: v_cvt_f16_f32_e32 v52, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v9
; SI-NEXT: v_cvt_f16_f32_e32 v48, v11
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_cvt_f16_f32_e32 v38, v16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v19
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
@@ -52448,18 +52404,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -52476,6 +52421,17 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -53259,6 +53215,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v44
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -53285,10 +53243,13 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v50
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v51, v11
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
@@ -53300,8 +53261,26 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v3, v26
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v50
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v8
+; SI-NEXT: v_mov_b32_e32 v8, v48
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v44
+; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v43
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
@@ -53329,17 +53308,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v18, v3, v5
; SI-NEXT: v_cvt_f32_f16_e32 v5, v37
; SI-NEXT: v_cvt_f32_f16_e32 v3, v16
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_mov_b32_e32 v51, v11
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v55, v5
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
@@ -53382,52 +53355,32 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v3, v6
; SI-NEXT: v_lshr_b64 v[58:59], v[34:35], 16
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v50
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v8
-; SI-NEXT: v_mov_b32_e32 v8, v48
; SI-NEXT: v_cvt_f16_f32_e32 v48, v5
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48
-; SI-NEXT: v_or_b32_e32 v6, v3, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v4
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v44
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v31
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v4
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48
; SI-NEXT: v_mov_b32_e32 v59, v48
-; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v38
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60
-; SI-NEXT: v_or_b32_e32 v4, v3, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v38, v43
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16
+; SI-NEXT: v_or_b32_e32 v6, v3, v5
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v24
; SI-NEXT: v_cvt_f32_f16_e32 v24, v8
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v31
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60
+; SI-NEXT: v_or_b32_e32 v4, v3, v4
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v20
; SI-NEXT: v_cvt_f32_f16_e32 v20, v39
+; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20
; SI-NEXT: v_cvt_f16_f32_e32 v31, v20
@@ -53524,14 +53477,15 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v32, v41
; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16
; SI-NEXT: v_lshr_b64 v[20:21], v[11:12], 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[20:21], v[56:57], 16
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v11, v24
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16
; SI-NEXT: v_mov_b32_e32 v39, v31
; SI-NEXT: v_mov_b32_e32 v31, v60
@@ -53541,7 +53495,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v37, v55
; SI-NEXT: v_lshr_b64 v[55:56], v[5:6], 16
; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16
; SI-NEXT: .LBB59_3: ; %end
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58
@@ -53662,15 +53615,15 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
@@ -53681,9 +53634,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -53693,11 +53648,9 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
@@ -53722,8 +53675,24 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
@@ -53748,22 +53717,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB59_4:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll
new file mode 100644
index 0000000..d7d623a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s
+
+; Make sure we do not infer anything about implicit inputs through an
+; intrinsic call which is not nocallback.
+
+declare zeroext i32 @return_i32()
+
+define i32 @test_i32_return() gc "statepoint-example" {
+; CHECK-LABEL: define i32 @test_i32_return(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] gc "statepoint-example" {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[CALL1:%.*]] = call zeroext i32 @llvm.experimental.gc.result.i32(token [[SAFEPOINT_TOKEN]])
+; CHECK-NEXT: ret i32 [[CALL1]]
+;
+entry:
+ %safepoint_token = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0)
+ %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(token %safepoint_token)
+ ret i32 %call1
+}
+
+declare token @llvm.experimental.gc.statepoint.p0(i64 immarg, i32 immarg, ptr, i32 immarg, i32 immarg, ...)
+declare i32 @llvm.experimental.gc.result.i32(token) #0
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
+;.
+; CHECK: attributes #[[ATTR0]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll
new file mode 100644
index 0000000..71c509a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-attributor -mcpu=gfx90a %s | FileCheck %s
+
+; Make sure we infer no inputs are used through some intrinsics
+
+define void @use_fake_use(i32 %arg) {
+; CHECK-LABEL: define void @use_fake_use(
+; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ARG]])
+; CHECK-NEXT: ret void
+;
+ call void (...) @llvm.fake.use(i32 %arg)
+ ret void
+}
+
+define void @use_donothing() {
+; CHECK-LABEL: define void @use_donothing(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: ret void
+;
+ call void @llvm.donothing()
+ ret void
+}
+
+define void @use_assume(i1 %arg) {
+; CHECK-LABEL: define void @use_assume(
+; CHECK-SAME: i1 [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.assume(i1 [[ARG]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.assume(i1 %arg)
+ ret void
+}
+
+define void @use_trap() {
+; CHECK-LABEL: define void @use_trap(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: ret void
+;
+ call void @llvm.trap()
+ ret void
+}
+
+define void @use_debugtrap() {
+; CHECK-LABEL: define void @use_debugtrap(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT: call void @llvm.debugtrap()
+; CHECK-NEXT: ret void
+;
+ call void @llvm.debugtrap()
+ ret void
+}
+
+define void @use_ubsantrap() {
+; CHECK-LABEL: define void @use_ubsantrap(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT: call void @llvm.ubsantrap(i8 0)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.ubsantrap(i8 0)
+ ret void
+}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR6:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) "target-cpu"="gfx90a" }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 30ad46d9..b6b59d8 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -968,14 +968,14 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc
; GFX8-NEXT: s_movk_i32 s4, 0x70
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[28:29]
-; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21]
-; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -9552,6 +9552,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1
; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1
; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
@@ -9563,7 +9564,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1
; GFX8-NEXT: flat_load_ushort v44, v[1:2]
; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1
@@ -9686,6 +9686,17 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v51
; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
+; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xb8, v0
; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb4, v0
@@ -9816,17 +9827,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0
; GFX8-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen
; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
index 6831380..04f8ad8 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -450,23 +450,38 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
-; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
@@ -976,23 +991,38 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
-; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
@@ -1159,24 +1189,23 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; SDAG-GFX1100-NEXT: s_mov_b32 s9, s12
; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX1100-NEXT: s_mov_b32 s6, s3
-; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0
; SDAG-GFX1100-NEXT: s_mov_b32 s8, s1
; SDAG-GFX1100-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2
-; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0
; SDAG-GFX1100-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
; SDAG-GFX1100-NEXT: s_clause 0x1
; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x54
; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12
; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3
; SDAG-GFX1100-NEXT: v_mov_b32_e32 v5, s0
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
+; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12
; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2
; SDAG-GFX1100-NEXT: s_mov_b32 s2, s1
-; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0)
@@ -1220,12 +1249,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; GISEL-GFX1100-NEXT: s_mov_b32 s8, s1
; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2
; GISEL-GFX1100-NEXT: s_mov_b32 s10, s3
-; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
; GISEL-GFX1100-NEXT: s_clause 0x1
; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s0
+; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1
; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2
; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 8e12e7e..832e43f 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -4253,6 +4253,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
@@ -4260,7 +4261,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
-; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; VI-NEXT: s_mov_b32 s38, -1
; VI-NEXT: s_mov_b32 s39, 0xe80000
; VI-NEXT: s_add_u32 s36, s36, s3
@@ -4272,7 +4272,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32
; VI-NEXT: s_swappc_b64 s[30:31], s[8:9]
; VI-NEXT: s_endpgm
@@ -4285,6 +4285,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
@@ -4292,7 +4293,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
-; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; CI-NEXT: s_mov_b32 s38, -1
; CI-NEXT: s_mov_b32 s39, 0xe8f000
; CI-NEXT: s_add_u32 s36, s36, s3
@@ -4304,7 +4304,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_waitcnt vmcnt(7)
+; CI-NEXT: s_waitcnt vmcnt(6)
; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32
; CI-NEXT: s_swappc_b64 s[30:31], s[8:9]
; CI-NEXT: s_endpgm
@@ -4317,6 +4317,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
@@ -4324,7 +4325,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
-; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; GFX9-NEXT: s_mov_b32 s38, -1
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s3
@@ -4336,7 +4336,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GFX9-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 0fc54ae..26f7789 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -2407,51 +2407,52 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0
-; GISEL-NEXT: v_mul_lo_u32 v24, v30, v19
-; GISEL-NEXT: v_mul_lo_u32 v25, v29, v18
+; GISEL-NEXT: v_mul_lo_u32 v27, v30, v19
+; GISEL-NEXT: v_mul_lo_u32 v36, v29, v18
; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0
; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0
-; GISEL-NEXT: v_mul_lo_u32 v26, v35, v3
-; GISEL-NEXT: v_mul_lo_u32 v27, v34, v2
+; GISEL-NEXT: v_mul_lo_u32 v37, v35, v3
+; GISEL-NEXT: v_mul_lo_u32 v38, v34, v2
; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v29, v32, v[14:15]
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23]
; GISEL-NEXT: v_mov_b32_e32 v22, v19
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[2:3]
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v20, v[14:15]
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v30, v32, v[1:2]
-; GISEL-NEXT: v_mov_b32_e32 v23, v14
-; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v21, v[22:23]
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v29, v31, v[1:2]
-; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v24, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v34, v20, v[22:23]
-; GISEL-NEXT: v_addc_u32_e64 v14, s[6:7], v15, v26, s[6:7]
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v25, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v10, v31, v[2:3]
+; GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v20, v[14:15]
+; GISEL-NEXT: v_mov_b32_e32 v2, v23
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], vcc, v30, v32, v[1:2]
+; GISEL-NEXT: v_mov_b32_e32 v23, v25
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[22:23]
+; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v29, v31, v[14:15]
+; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v24, v27, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v34, v20, v[1:2]
+; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], v26, v37, s[6:7]
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, v36, vcc
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_xor_b32_e32 v15, v0, v28
-; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v14, v27, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v18
-; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v22, s[4:5]
-; GISEL-NEXT: v_xor_b32_e32 v16, v12, v33
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v10, v32, v[3:4]
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v4, v21, v[0:1]
-; GISEL-NEXT: v_xor_b32_e32 v14, v14, v33
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v31, v[12:13]
-; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v15, v28
+; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v17, v22, vcc
+; GISEL-NEXT: v_xor_b32_e32 v19, v0, v28
+; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v2, v38, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v12, v18
+; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v14, s[4:5]
+; GISEL-NEXT: v_xor_b32_e32 v18, v2, v33
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v32, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v16, v28
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v4, v21, v[0:1]
+; GISEL-NEXT: v_xor_b32_e32 v10, v14, v33
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v11, v31, v[2:3]
+; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v19, v28
; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v5, v20, v[3:4]
-; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v16, v33
-; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v14, v33, s[8:9]
-; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc
-; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28
-; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v23, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v20, v[12:13]
+; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v18, v33
+; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v10, v33, s[8:9]
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v8, v23, vcc
+; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v16, vcc
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28
+; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v15, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc
; GISEL-NEXT: v_xor_b32_e32 v6, v6, v33
; GISEL-NEXT: v_xor_b32_e32 v7, v8, v28
-; GISEL-NEXT: v_xor_b32_e32 v8, v3, v33
-; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v2, v28, s[6:7]
+; GISEL-NEXT: v_xor_b32_e32 v8, v2, v33
+; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v3, v28, s[6:7]
; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v28, vcc
; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v33, s[8:9]
; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v8, v33, vcc
@@ -3216,36 +3217,38 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0
; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0
-; GISEL-NEXT: v_mul_lo_u32 v28, v8, v21
-; GISEL-NEXT: v_mul_lo_u32 v29, v9, v20
+; GISEL-NEXT: v_mul_lo_u32 v34, v8, v21
+; GISEL-NEXT: v_mul_lo_u32 v35, v9, v20
; GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v24, 0
; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0
-; GISEL-NEXT: v_mul_lo_u32 v30, v12, v19
-; GISEL-NEXT: v_mul_lo_u32 v31, v13, v18
+; GISEL-NEXT: v_mul_lo_u32 v36, v12, v19
+; GISEL-NEXT: v_mul_lo_u32 v37, v13, v18
; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23]
; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27]
-; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[18:19]
-; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[22:23]
-; GISEL-NEXT: v_mad_u64_u32 v[17:18], vcc, v8, v33, v[17:18]
-; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v12, v25, v[21:22]
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v9, v32, v[17:18]
-; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], v19, v28, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v13, v24, v[21:22]
-; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], v23, v30, s[6:7]
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v29, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v10, v32, v[18:19]
+; GISEL-NEXT: v_mad_u64_u32 v[28:29], s[4:5], v14, v24, v[22:23]
+; GISEL-NEXT: v_mov_b32_e32 v18, v26
+; GISEL-NEXT: v_mad_u64_u32 v[30:31], vcc, v8, v33, v[17:18]
+; GISEL-NEXT: v_mov_b32_e32 v22, v28
+; GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v12, v25, v[21:22]
+; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v9, v32, v[30:31]
+; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v27, v34, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v13, v24, v[17:18]
+; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v29, v36, s[6:7]
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v35, vcc
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v18, v31, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v21, vcc
+; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], v13, v37, s[4:5]
; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v20
-; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v12, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[17:18]
-; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v32, v[16:17]
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v15, v24, v[18:19]
-; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v9, vcc
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
-; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v13, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc
+; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v8, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[12:13]
+; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[13:14]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v11, v32, v[16:17]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v15, v24, v[18:19]
+; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v22, vcc
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v9, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v10, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
%shl = urem <2 x i128> %lhs, %rhs
ret <2 x i128> %shl
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 0cae0e5..5cc6845 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -851,12 +851,12 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: ds_write_b8 v0, v1 offset:9
+; CI-NEXT: ds_write_b8 v0, v2 offset:13
; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1
; CI-NEXT: ds_write_b8 v0, v1 offset:5
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; CI-NEXT: ds_write_b8 v0, v1 offset:9
-; CI-NEXT: ds_write_b8 v0, v2 offset:13
; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll
index 683887b..a4b3a85 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll
@@ -476,7 +476,6 @@ define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24
; GCN-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
; GCN-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
; GCN-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
@@ -489,6 +488,7 @@ define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
; GCN-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
; GCN-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
; GCN-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
+; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -1029,7 +1029,6 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24
; GCN-NEXT: v_accvgpr_write_b32 a31, v21 ; Reload Reuse
; GCN-NEXT: v_accvgpr_write_b32 a30, v22 ; Reload Reuse
; GCN-NEXT: v_accvgpr_write_b32 a29, v23 ; Reload Reuse
@@ -1040,6 +1039,7 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
; GCN-NEXT: v_accvgpr_write_b32 a24, v28 ; Reload Reuse
; GCN-NEXT: v_accvgpr_write_b32 a23, v29 ; Reload Reuse
; GCN-NEXT: v_accvgpr_write_b32 a22, v30 ; Reload Reuse
+; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 5fb50d0..da08f4f 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -3755,42 +3755,44 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; CI-NEXT: v_cvt_f16_f32_e32 v13, v22
; CI-NEXT: v_or_b32_e32 v10, v14, v10
+; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4
+; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32
; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; CI-NEXT: v_or_b32_e32 v17, v18, v17
; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; CI-NEXT: v_or_b32_e32 v17, v18, v17
; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16
; CI-NEXT: v_cvt_f16_f32_e32 v22, v27
-; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
-; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; CI-NEXT: v_or_b32_e32 v13, v16, v13
; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12
; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; CI-NEXT: v_or_b32_e32 v19, v20, v19
; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v21
; CI-NEXT: v_cvt_f16_f32_e32 v21, v30
; CI-NEXT: v_or_b32_e32 v20, v22, v20
; CI-NEXT: v_cvt_f16_f32_e32 v22, v29
-; CI-NEXT: s_waitcnt vmcnt(6)
+; CI-NEXT: s_waitcnt vmcnt(8)
; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; CI-NEXT: s_waitcnt vmcnt(5)
+; CI-NEXT: s_waitcnt vmcnt(7)
; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; CI-NEXT: v_or_b32_e32 v21, v22, v21
; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; CI-NEXT: s_waitcnt vmcnt(3)
+; CI-NEXT: s_waitcnt vmcnt(5)
; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
-; CI-NEXT: s_waitcnt vmcnt(2)
+; CI-NEXT: s_waitcnt vmcnt(4)
; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
@@ -3802,6 +3804,27 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; CI-NEXT: s_waitcnt vmcnt(6)
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: s_waitcnt vmcnt(5)
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; CI-NEXT: v_or_b32_e32 v14, v15, v14
+; CI-NEXT: s_waitcnt vmcnt(3)
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v18
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; CI-NEXT: v_or_b32_e32 v12, v12, v15
+; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0
+; CI-NEXT: v_or_b32_e32 v11, v16, v11
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -3968,28 +3991,6 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
; CI-NEXT: v_or_b32_e32 v31, v32, v31
; CI-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0
; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4
-; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32
-; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
-; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
-; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
-; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; CI-NEXT: v_or_b32_e32 v14, v15, v14
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v16
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v18
-; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
-; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; CI-NEXT: v_or_b32_e32 v12, v12, v15
-; CI-NEXT: v_or_b32_e32 v11, v16, v11
-; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0
; CI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen
; CI-NEXT: v_add_i32_e32 v11, vcc, 64, v0
; CI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen
diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
index 0548bcf..590d69b 100644
--- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
@@ -1,6 +1,19 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -passes=finalizebundle-test %s -o - | FileCheck %s
+--- |
+
+ @foo = addrspace(3) global i32 poison
+
+ define void @test_overlap() { unreachable }
+ define void @test_dead_redef() { unreachable }
+ define void @test_tied() { unreachable }
+ define void @test_mmo_merge1() { unreachable }
+ define void @test_mmo_merge2() { unreachable }
+ define void @test_mmo_drop() { unreachable }
+
+...
+
---
name: test_overlap
body: |
@@ -34,3 +47,55 @@ body: |
$vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
dead $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
...
+
+---
+name: test_tied
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_tied
+ ; CHECK: BUNDLE implicit-def %0, implicit-def %2, implicit %1:vgpr_32(tied-def 1), implicit $mode, implicit $exec {
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32
+ ; CHECK-NEXT: [[V_FMAC_F16_e32_:%[0-9]+]]:vgpr_32 = V_FMAC_F16_e32 internal [[COPY]], internal [[COPY]], %1:vgpr_32, implicit $mode, implicit $exec
+ ; CHECK-NEXT: }
+ %1:vgpr_32 = COPY %0:vgpr_32
+ %2:vgpr_32 = V_FMAC_F16_e32 %1, %1, %0, implicit $mode, implicit $exec
+...
+
+---
+name: test_mmo_merge1
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_mmo_merge1
+ ; CHECK: BUNDLE implicit-def %0, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3) {
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32
+ ; CHECK-NEXT: DS_WRITE_B32_gfx9 %1:vgpr_32, internal [[COPY]], 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
+ ; CHECK-NEXT: }
+ %1:vgpr_32 = COPY %0:vgpr_32
+ DS_WRITE_B32_gfx9 %0, %1, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
+...
+
+---
+name: test_mmo_merge2
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_mmo_merge2
+ ; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3), (store (s32) into @foo + 4, addrspace 3) {
+ ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
+ ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3)
+ ; CHECK-NEXT: }
+ DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
+ DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3)
+...
+
+---
+name: test_mmo_drop
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_mmo_drop
+ ; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec {
+ ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
+ ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec
+ ; CHECK-NEXT: }
+ DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
+ DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index e042157..460f121 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -237,31 +237,31 @@ define i128 @fptosi_f64_to_i128(double %x) {
; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0
; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr9
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr8
@@ -275,17 +275,18 @@ define i128 @fptosi_f64_to_i128(double %x) {
; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3]
-; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7]
-; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3]
+; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9
+; GISEL-NEXT: v_mov_b32_e32 v2, v4
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4]
+; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7]
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc
; GISEL-NEXT: .LBB0_6: ; %Flow1
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: .LBB0_7: ; %Flow2
@@ -604,31 +605,31 @@ define i128 @fptoui_f64_to_i128(double %x) {
; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0
; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr9
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr8
@@ -642,17 +643,18 @@ define i128 @fptoui_f64_to_i128(double %x) {
; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3]
-; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7]
-; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3]
+; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9
+; GISEL-NEXT: v_mov_b32_e32 v2, v4
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4]
+; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7]
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc
; GISEL-NEXT: .LBB1_6: ; %Flow1
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: .LBB1_7: ; %Flow2
@@ -962,31 +964,31 @@ define i128 @fptosi_f32_to_i128(float %x) {
; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0
; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr8
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr9
@@ -999,12 +1001,14 @@ define i128 @fptosi_f32_to_i128(float %x) {
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0
-; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8
+; GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5]
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
; GISEL-NEXT: .LBB2_6: ; %Flow1
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
; GISEL-NEXT: .LBB2_7: ; %Flow2
@@ -1314,31 +1318,31 @@ define i128 @fptoui_f32_to_i128(float %x) {
; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0
; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr8
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr9
@@ -1351,12 +1355,14 @@ define i128 @fptoui_f32_to_i128(float %x) {
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0
-; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8
+; GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5]
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
; GISEL-NEXT: .LBB3_6: ; %Flow1
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
; GISEL-NEXT: .LBB3_7: ; %Flow2
@@ -1702,31 +1708,31 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0
; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr5
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GISEL-NEXT: ; implicit-def: $vgpr8
; GISEL-NEXT: .LBB6_4: ; %Flow
@@ -2050,31 +2056,31 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0
; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr5
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GISEL-NEXT: ; implicit-def: $vgpr8
; GISEL-NEXT: .LBB7_4: ; %Flow
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index b750d28..d43c6ba 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -807,7 +807,7 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX10-NEXT: buffer_store_dword v95, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: v_writelane_b32 v100, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: s_clause 0x1f
+; GFX10-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v95, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:8
@@ -863,7 +863,7 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX11-NEXT: s_mov_b32 s1, return_100xi32@abs32@hi
; GFX11-NEXT: s_mov_b32 s0, return_100xi32@abs32@lo
; GFX11-NEXT: s_addk_i32 s32, 0x90
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:124
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:120
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:116
@@ -898,7 +898,7 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX11-NEXT: scratch_store_b32 off, v95, s33
; GFX11-NEXT: v_writelane_b32 v100, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v95, off, s33
; GFX11-NEXT: scratch_load_b32 v94, off, s33 offset:4
; GFX11-NEXT: scratch_load_b32 v93, off, s33 offset:8
@@ -2416,7 +2416,6 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:148
; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:152
; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:156
-; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104
; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:100
; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:96
; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:92
@@ -2459,7 +2458,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; GFX10-NEXT: s_clause 0x8
+; GFX10-NEXT: s_clause 0x7
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28
; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24
; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20
@@ -2468,6 +2467,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8
; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:4
; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32
+; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104
; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:160
; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2518,7 +2518,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-LABEL: return_72xi32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0xc
+; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:212
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:204
@@ -2551,23 +2551,23 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
-; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
+; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112
; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108
; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104
-; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
-; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124
; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-NEXT: s_clause 0x10
; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144
; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140
; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136
-; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-NEXT: s_clause 0xd
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152
@@ -2608,7 +2608,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT: s_clause 0xc
+; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:172
@@ -2641,21 +2641,6 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX9-NEXT: s_mov_b32 s34, s32
; GFX9-NEXT: s_add_i32 s32, s32, 0x28000
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
@@ -2733,6 +2718,21 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX9-NEXT: v_mov_b32_e32 v29, 0
; GFX9-NEXT: v_mov_b32_e32 v30, 0
; GFX9-NEXT: v_mov_b32_e32 v31, 0
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v63, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37]
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636
@@ -2914,21 +2914,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: s_mov_b32 s38, s34
; GFX10-NEXT: s_mov_b32 s34, s32
; GFX10-NEXT: s_add_i32 s32, s32, 0x14000
-; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: v_writelane_b32 v63, s30, 0
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
@@ -2971,12 +2957,11 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160
; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
-; GFX10-NEXT: v_writelane_b32 v63, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: v_mov_b32_e32 v6, 0
; GFX10-NEXT: v_mov_b32_e32 v7, 0
@@ -3006,9 +2991,24 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: v_mov_b32_e32 v31, 0
; GFX10-NEXT: s_mov_b32 s37, return_72xi32@abs32@hi
; GFX10-NEXT: s_mov_b32 s36, return_72xi32@abs32@lo
+; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: v_writelane_b32 v63, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37]
-; GFX10-NEXT: s_clause 0x28
+; GFX10-NEXT: s_clause 0x3e
; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:636
; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:644
@@ -3050,30 +3050,6 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:792
; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796
; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:516
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:520
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:524
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:528
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:532
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:536
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:540
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:544
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill
-; GFX10-NEXT: s_clause 0x15
; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548
; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552
; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:556
@@ -3096,6 +3072,29 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:624
; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:628
; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:632
+; GFX10-NEXT: s_waitcnt vmcnt(22)
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:520
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:524
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:528
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:532
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:536
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:540
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:544
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill
; GFX10-NEXT: v_mov_b32_e32 v0, 24
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108
; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s32
@@ -3138,7 +3137,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152
; GFX10-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156
; GFX10-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160
-; GFX10-NEXT: s_clause 0x7
+; GFX10-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1536
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1540
; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1544
@@ -3151,7 +3150,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: v_mov_b32_e32 v1, 42
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37]
-; GFX10-NEXT: s_clause 0xe
+; GFX10-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8
@@ -3199,7 +3198,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: s_mov_b32 s36, s34
; GFX11-NEXT: s_mov_b32 s34, s32
; GFX11-NEXT: s_addk_i32 s32, 0xa00
-; GFX11-NEXT: s_clause 0xb
+; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:40
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:36
@@ -3341,18 +3340,18 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: s_add_i32 s2, s32, 16
; GFX11-NEXT: v_mov_b32_e32 v30, v46
; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584
+; GFX11-NEXT: s_clause 0x3 ; 64-byte Folded Reload
; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1568
; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552
; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536
+; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584
; GFX11-NEXT: s_add_i32 s2, s33, 0x400
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, 42
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_clause 0xb
+; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v59, off, s33
; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4
; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index f807169..93d7eeb 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -255,11 +255,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16
; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off
; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:64
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70
; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60
; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70
; GCN-SDAG-NEXT: v_mov_b64_e32 v[64:65], 16
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60
; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0
; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50
; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
index 7e1055b..03b56ca 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
@@ -11,7 +11,7 @@ body: |
; CHECK-LABEL: name: mimg_nsa
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) {
; CHECK-NEXT: S_CLAUSE 1
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
@@ -29,7 +29,7 @@ body: |
; CHECK-LABEL: name: mimg_nsa_mixed
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) {
; CHECK-NEXT: S_CLAUSE 2
; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir
index 9689dda..68f9e83 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir
@@ -10,7 +10,7 @@ body: |
; CHECK-LABEL: name: mimg
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) {
; CHECK-NEXT: S_CLAUSE 1
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
@@ -28,7 +28,7 @@ body: |
; CHECK-LABEL: name: mimg_mixed
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) {
; CHECK-NEXT: S_CLAUSE 2
; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 31b6b53..f705a2f 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -5775,28 +5775,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX7-GISEL-LABEL: clpeak_imad_pat_i64:
; GFX7-GISEL: ; %bb.0: ; %entry
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2]
-; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7]
-; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0
-; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v8
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: clpeak_imad_pat_i64:
@@ -5831,28 +5831,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX8-GISEL-LABEL: clpeak_imad_pat_i64:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, v0, v6
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0
-; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v5
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, v0, v8
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v4
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-SDAG-LABEL: clpeak_imad_pat_i64:
@@ -5883,28 +5883,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX900-GISEL-LABEL: clpeak_imad_pat_i64:
; GFX900-GISEL: ; %bb.0: ; %entry
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2]
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v6
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, v4, v7, vcc
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7]
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v5
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v4
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v2, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v8
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v6, v9, vcc
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v9, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5]
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-SDAG-LABEL: clpeak_imad_pat_i64:
@@ -5935,29 +5935,29 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX90A-GISEL-LABEL: clpeak_imad_pat_i64:
; GFX90A-GISEL: ; %bb.0: ; %entry
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5]
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v4
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v6
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v7, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v2, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, v3, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, v[6:7]
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v2
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v4
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v5, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, v[0:1]
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v4, v3, v0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v7, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v8, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, v[2:3]
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5]
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v6
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v8
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, v1, v9, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v2, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v2, v[6:7]
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v5, v8
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v6, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, v[0:1]
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v3, v4
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[4:5]
; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -6408,52 +6408,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX7-GISEL-LABEL: clpeak_imad_pat_v2i64:
; GFX7-GISEL: ; %bb.0: ; %entry
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
-; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v2
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc
+; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v2
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9]
-; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v12
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11]
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0
-; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, v2, v14
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v14
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11]
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0
+; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, v2, v16
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15]
-; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2]
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc
; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc
-; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v7
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1]
-; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v9
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1]
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc
+; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v13
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -6513,52 +6513,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i64:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
-; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v2
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v2
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v12
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11]
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0
-; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, v2, v14
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v14
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11]
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0
+; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, v2, v16
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 1, v0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2]
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc
; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc
-; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v10
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v7
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v13
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v9
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1]
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc
+; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 1, v13
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -6610,52 +6610,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX900-GISEL-LABEL: clpeak_imad_pat_v2i64:
; GFX900-GISEL: ; %bb.0: ; %entry
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v2
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v3, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2]
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9]
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v12
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11]
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v8, v13, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, v2, v14
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v14
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11]
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v12, v15, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v2, v16
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v10
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2]
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v9, v15, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15]
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v0
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v8, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2]
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v19, vcc, v8, v17, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v12, vcc
; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v8, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2]
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v9, vcc
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v10
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1]
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v7
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1]
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v13
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v4, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v9
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1]
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v16, vcc
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v3, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2]
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12]
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -6707,54 +6707,54 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX90A-GISEL-LABEL: clpeak_imad_pat_v2i64:
; GFX90A-GISEL: ; %bb.0: ; %entry
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v3, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v5, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v4, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v4, v[2:3]
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v7, 0
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v10
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v6, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v6, v[8:9]
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, v1, v11, vcc
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v8
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v2, v12
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v4, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v10, v5, 0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, v3, v13, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v4, v[10:11]
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v7, 0
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v4
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v6, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v6, v[10:11]
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v6
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v2
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v3, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v8
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v6, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v6, v[0:1]
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v9, vcc
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v4
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, v[0:1]
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v5, vcc
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v7, v7, v0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v13, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v12, v[2:3]
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v15, 0
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[2:3]
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v8
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v7, 0
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v0, v12
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v6, v[8:9]
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v1, v13, vcc
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v10
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, v2, v14
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v5, 0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v3, v15, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v4, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v4, v[10:11]
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, 0
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v12
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v6, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v6, v[10:11]
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v10, v5, v12
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v2
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v3, vcc
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v13, vcc, 1, v8
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v9, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v5, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v5, v[0:1]
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v12, 0
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v4
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v6
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v11, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v11, v[0:1]
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v10, vcc
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v7, v4
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v14, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v13, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v13, v[4:5]
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v16, 0
; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v14, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v14, v[4:5]
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v4
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v15, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v15, v[4:5]
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v6
; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
index fa52b96..02eda2c 100644
--- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
+++ b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
@@ -6,40 +6,12 @@
# No more registers shall be defined
---
name: main
-alignment: 1
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
tracksRegLiveness: true
registers:
- - { id: 1, class: sreg_32_xm0, preferred-register: '%1' }
- - { id: 2, class: vreg_64, preferred-register: '%2' }
- - { id: 3, class: vreg_64 }
- - { id: 4, class: vreg_64 }
- - { id: 5, class: vreg_64 }
- - { id: 6, class: vreg_96 }
- - { id: 7, class: vreg_96 }
- - { id: 8, class: vreg_128 }
- - { id: 9, class: vreg_128 }
-liveins:
- - { reg: '$sgpr6', virtual-reg: '%1' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
+ - { id: 0, class: sreg_32_xm0, preferred-register: '%0' }
+ - { id: 1, class: vreg_64, preferred-register: '%1' }
body: |
- bb.0.entry:
+ bb.0:
liveins: $sgpr0, $vgpr0_vgpr1
; CHECK-LABEL: name: main
@@ -59,20 +31,21 @@ body: |
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF2]]
; CHECK-NEXT: [[COPY3:%[0-9]+]].sub3:vreg_128 = COPY [[DEF]].sub0
; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr
- %3 = IMPLICIT_DEF
- undef %4.sub0 = COPY $sgpr0
- %4.sub1 = COPY %3.sub0
- undef %5.sub0 = COPY %4.sub1
- %5.sub1 = COPY %4.sub0
- FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %5, 0, 0, implicit $exec, implicit $flat_scr
+ %2:vreg_64 = IMPLICIT_DEF
+ undef %3.sub0:vreg_64 = COPY $sgpr0
+ %3.sub1:vreg_64 = COPY %2.sub0
+ undef %4.sub0:vreg_64 = COPY %3.sub1
+ %4.sub1:vreg_64 = COPY %3.sub0
+ FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %4, 0, 0, implicit $exec, implicit $flat_scr
- %6 = IMPLICIT_DEF
- undef %7.sub0_sub1 = COPY %6
- %7.sub2 = COPY %3.sub0
- FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %7, 0, 0, implicit $exec, implicit $flat_scr
+ %5:vreg_96 = IMPLICIT_DEF
+ undef %6.sub0_sub1:vreg_96 = COPY %5
+ %6.sub2:vreg_96 = COPY %2.sub0
+ FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %6, 0, 0, implicit $exec, implicit $flat_scr
+
+ %7:vreg_128 = IMPLICIT_DEF
+ undef %8.sub0_sub1_sub2:vreg_128 = COPY %7
+ %8.sub3:vreg_128 = COPY %2.sub0
+ FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %8, 0, 0, implicit $exec, implicit $flat_scr
- %8 = IMPLICIT_DEF
- undef %9.sub0_sub1_sub2 = COPY %8
- %9.sub3 = COPY %3.sub0
- FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %9, 0, 0, implicit $exec, implicit $flat_scr
...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll
index 4719ab9..cbf697f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll
@@ -1,13 +1,20 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -stop-after=postrapseudos -o - < %s | FileCheck -enable-var-scope -check-prefix=MIR %s
-; MIR-LABEL: name: gws_barrier_offset0{{$}}
-; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec {
-; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
-; MIR-NEXT: S_WAITCNT 0
-; MIR-NEXT: }
define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
+ ; MIR-LABEL: name: gws_barrier_offset0
+ ; MIR: bb.0 (%ir-block.0):
+ ; MIR-NEXT: liveins: $sgpr8_sgpr9
+ ; MIR-NEXT: {{ $}}
+ ; MIR-NEXT: renamable $sgpr4 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset, align 16, addrspace 4)
+ ; MIR-NEXT: $m0 = S_MOV_B32 0
+ ; MIR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec
+ ; MIR-NEXT: BUNDLE implicit killed renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") {
+ ; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
+ ; MIR-NEXT: S_WAITCNT 0
+ ; MIR-NEXT: }
+ ; MIR-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
ret void
}
@@ -17,5 +24,3 @@ declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1
attributes #0 = { nounwind }
attributes #1 = { convergent inaccessiblememonly nounwind }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; MIR: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
index c5f6e2b..417b8e0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
@@ -35,7 +35,7 @@
; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
; MIR-LABEL: name: gws_barrier_offset0{{$}}
-; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec {
+; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec
; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
; MIR-NEXT: S_WAITCNT 0
; MIR-NEXT: }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index 4419b8c..af270e5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -13,9 +13,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SDAG-GFX11-TRUE16-NEXT: v_dot2_bf16_bf16 v0.l, s2, s3, v0.l
; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -26,9 +26,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SDAG-GFX11-FAKE16-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1
; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
index 0194d25..72b4769 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -12,9 +12,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SDAG-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l
; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -25,9 +25,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SDAG-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
@@ -38,9 +38,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GISEL-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GISEL-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GISEL-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l
; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -51,9 +51,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GISEL-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GISEL-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GISEL-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
index 1ab4cb0..d82d6bc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
@@ -781,16 +781,23 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr
; GISEL12-NEXT: v_dual_mov_b32 v14, v38 :: v_dual_mov_b32 v15, v39
; GISEL12-NEXT: s_wait_kmcnt 0x0
; GISEL12-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL12-NEXT: v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v25, v1
-; GISEL12-NEXT: v_dual_mov_b32 v26, v2 :: v_dual_mov_b32 v27, v3
-; GISEL12-NEXT: v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v5
-; GISEL12-NEXT: v_dual_mov_b32 v30, v6 :: v_dual_mov_b32 v31, v7
-; GISEL12-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
-; GISEL12-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11
-; GISEL12-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13
-; GISEL12-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15
+; GISEL12-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
+; GISEL12-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v43, v3
+; GISEL12-NEXT: v_dual_mov_b32 v44, v4 :: v_dual_mov_b32 v45, v5
+; GISEL12-NEXT: v_dual_mov_b32 v46, v6 :: v_dual_mov_b32 v47, v7
+; GISEL12-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v49, v9
+; GISEL12-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v11
+; GISEL12-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v53, v13
+; GISEL12-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v55, v15
; GISEL12-NEXT: s_mov_b32 exec_lo, s9
-; GISEL12-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec
+; GISEL12-NEXT: v_dual_mov_b32 v24, v40 :: v_dual_mov_b32 v25, v41
+; GISEL12-NEXT: v_dual_mov_b32 v26, v42 :: v_dual_mov_b32 v27, v43
+; GISEL12-NEXT: v_dual_mov_b32 v28, v44 :: v_dual_mov_b32 v29, v45
+; GISEL12-NEXT: v_dual_mov_b32 v30, v46 :: v_dual_mov_b32 v31, v47
+; GISEL12-NEXT: v_dual_mov_b32 v32, v48 :: v_dual_mov_b32 v33, v49
+; GISEL12-NEXT: v_dual_mov_b32 v34, v50 :: v_dual_mov_b32 v35, v51
+; GISEL12-NEXT: v_dual_mov_b32 v36, v52 :: v_dual_mov_b32 v37, v53
+; GISEL12-NEXT: v_dual_mov_b32 v38, v54 :: v_dual_mov_b32 v39, v55
; GISEL12-NEXT: .LBB5_2: ; %tail
; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s4
@@ -946,24 +953,39 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr
; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51]
; GISEL10-NEXT: s_waitcnt lgkmcnt(0)
; GISEL10-NEXT: s_swappc_b64 s[30:31], s[12:13]
-; GISEL10-NEXT: v_mov_b32_e32 v24, v0
-; GISEL10-NEXT: v_mov_b32_e32 v25, v1
-; GISEL10-NEXT: v_mov_b32_e32 v26, v2
-; GISEL10-NEXT: v_mov_b32_e32 v27, v3
-; GISEL10-NEXT: v_mov_b32_e32 v28, v4
-; GISEL10-NEXT: v_mov_b32_e32 v29, v5
-; GISEL10-NEXT: v_mov_b32_e32 v30, v6
-; GISEL10-NEXT: v_mov_b32_e32 v31, v7
-; GISEL10-NEXT: v_mov_b32_e32 v32, v8
-; GISEL10-NEXT: v_mov_b32_e32 v33, v9
-; GISEL10-NEXT: v_mov_b32_e32 v34, v10
-; GISEL10-NEXT: v_mov_b32_e32 v35, v11
-; GISEL10-NEXT: v_mov_b32_e32 v36, v12
-; GISEL10-NEXT: v_mov_b32_e32 v37, v13
-; GISEL10-NEXT: v_mov_b32_e32 v38, v14
-; GISEL10-NEXT: v_mov_b32_e32 v39, v15
+; GISEL10-NEXT: v_mov_b32_e32 v40, v0
+; GISEL10-NEXT: v_mov_b32_e32 v41, v1
+; GISEL10-NEXT: v_mov_b32_e32 v42, v2
+; GISEL10-NEXT: v_mov_b32_e32 v43, v3
+; GISEL10-NEXT: v_mov_b32_e32 v44, v4
+; GISEL10-NEXT: v_mov_b32_e32 v45, v5
+; GISEL10-NEXT: v_mov_b32_e32 v46, v6
+; GISEL10-NEXT: v_mov_b32_e32 v47, v7
+; GISEL10-NEXT: v_mov_b32_e32 v48, v8
+; GISEL10-NEXT: v_mov_b32_e32 v49, v9
+; GISEL10-NEXT: v_mov_b32_e32 v50, v10
+; GISEL10-NEXT: v_mov_b32_e32 v51, v11
+; GISEL10-NEXT: v_mov_b32_e32 v52, v12
+; GISEL10-NEXT: v_mov_b32_e32 v53, v13
+; GISEL10-NEXT: v_mov_b32_e32 v54, v14
+; GISEL10-NEXT: v_mov_b32_e32 v55, v15
; GISEL10-NEXT: s_mov_b32 exec_lo, s9
-; GISEL10-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec
+; GISEL10-NEXT: v_mov_b32_e32 v24, v40
+; GISEL10-NEXT: v_mov_b32_e32 v25, v41
+; GISEL10-NEXT: v_mov_b32_e32 v26, v42
+; GISEL10-NEXT: v_mov_b32_e32 v27, v43
+; GISEL10-NEXT: v_mov_b32_e32 v28, v44
+; GISEL10-NEXT: v_mov_b32_e32 v29, v45
+; GISEL10-NEXT: v_mov_b32_e32 v30, v46
+; GISEL10-NEXT: v_mov_b32_e32 v31, v47
+; GISEL10-NEXT: v_mov_b32_e32 v32, v48
+; GISEL10-NEXT: v_mov_b32_e32 v33, v49
+; GISEL10-NEXT: v_mov_b32_e32 v34, v50
+; GISEL10-NEXT: v_mov_b32_e32 v35, v51
+; GISEL10-NEXT: v_mov_b32_e32 v36, v52
+; GISEL10-NEXT: v_mov_b32_e32 v37, v53
+; GISEL10-NEXT: v_mov_b32_e32 v38, v54
+; GISEL10-NEXT: v_mov_b32_e32 v39, v55
; GISEL10-NEXT: .LBB5_2: ; %tail
; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GISEL10-NEXT: v_mov_b32_e32 v8, v24
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 0c1448a..1d08097 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -17,21 +17,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; SDAG-NEXT: v_mov_b32_e32 v4, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
; SDAG-NEXT: v_mov_b32_e32 v5, s16
+; SDAG-NEXT: v_mov_b32_e32 v4, 0
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
@@ -43,13 +41,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
@@ -175,16 +172,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
@@ -207,16 +203,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
@@ -520,21 +515,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
; GCN-NEXT: v_mov_b32_e32 v5, s16
+; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
; GCN-NEXT: s_nop 7
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
@@ -634,16 +627,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
@@ -802,11 +794,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -815,7 +807,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -833,12 +824,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -965,15 +955,14 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -1003,15 +992,14 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -1317,11 +1305,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1330,7 +1318,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1348,12 +1335,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1481,11 +1467,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1494,7 +1480,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1512,12 +1497,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1645,11 +1629,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1658,7 +1642,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1676,12 +1659,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1809,11 +1791,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1822,7 +1804,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1840,12 +1821,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1972,15 +1952,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2010,15 +1989,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -2323,15 +2301,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2361,15 +2338,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -2674,15 +2650,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2712,15 +2687,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -3025,15 +2999,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -3063,15 +3036,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index f93e5f0..83c240c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -10386,7 +10386,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s2, s0, 0x150
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15]
-; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7]
+; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11]
+; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19]
; GFX8-NEXT: v_mov_b32_e32 v13, s3
; GFX8-NEXT: v_mov_b32_e32 v12, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x140
@@ -10395,10 +10396,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v14, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x130
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3]
-; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11]
-; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19]
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_mov_b32_e32 v17, s3
; GFX8-NEXT: v_mov_b32_e32 v16, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x120
@@ -10406,20 +10403,21 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v19, s3
; GFX8-NEXT: v_mov_b32_e32 v18, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x110
-; GFX8-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo
; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi
; GFX8-NEXT: v_mov_b32_e32 v6, s4
; GFX8-NEXT: v_mov_b32_e32 v7, s5
+; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: v_mov_b32_e32 v8, s12
-; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23]
; GFX8-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NEXT: v_mov_b32_e32 v9, s13
-; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27]
; GFX8-NEXT: v_mov_b32_e32 v10, s14
; GFX8-NEXT: v_mov_b32_e32 v11, s15
; GFX8-NEXT: flat_store_dwordx4 v[56:57], v[28:31]
@@ -10588,6 +10586,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23]
+; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27]
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index bca39d0..59f4a9d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -4582,18 +4582,18 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v2
; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
; GCN-HSA-NEXT: s_waitcnt vmcnt(11)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index e55fb2ca..7203545 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -3313,12 +3313,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6
; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6
@@ -3726,7 +3726,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: s_nop 0
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_nop 0
; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
@@ -3740,7 +3739,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v26
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v27
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v28
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(12)
+; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(11)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1
@@ -3749,6 +3748,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v1
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v2
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v3
+; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23
@@ -3758,7 +3758,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v21
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v22
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index f879dc6..cb17f01 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -7788,19 +7788,18 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v13, 16, 8
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v35, v14, 8, 8
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xff, v14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v17, 8, 8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xff, v17
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 24, v17
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v52, v17, 16, 8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v53
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:128
@@ -7810,7 +7809,7 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: s_nop 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index bd191a3..062a985 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -3172,27 +3172,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32:
; VI-NO-DS128: ; %bb.0:
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s1
; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v16 offset1:1
; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:2 offset1:3
+; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
-; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
-; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
-; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v18
+; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
@@ -3200,7 +3198,6 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v20
; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v18
; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v17
-; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v20
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v19
; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v19
@@ -3243,17 +3240,19 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v56, 16, v19
; VI-NO-DS128-NEXT: v_and_b32_e32 v55, 0xffff, v19
; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
+; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v20
; VI-NO-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v20
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
-; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v19
; VI-NO-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v19
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v18
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v18
; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, s0
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
+; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v21
; VI-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v60, 16, v24
@@ -3296,21 +3295,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3
-; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
-; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
-; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
-; GFX9-NO-DS128-NEXT: s_nop 0
-; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v15
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
@@ -3337,9 +3332,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v22
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-NO-DS128-NEXT: s_nop 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v17
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v17
@@ -3360,16 +3357,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v22
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v22
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v24
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v23
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v23
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v19
@@ -3806,9 +3804,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
+; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10
; VI-DS128-NEXT: v_mov_b32_e32 v4, v3
; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11
@@ -3825,23 +3825,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
-; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
-; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
-; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
-; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
-; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17
; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16
+; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
+; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17
; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23
; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22
; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21
@@ -3850,21 +3843,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22
; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21
; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27
; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26
; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25
; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24
; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27
+; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26
; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
+; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@@ -3875,16 +3872,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39
; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38
-; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
; VI-DS128-NEXT: v_mov_b32_e32 v24, s0
+; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36
; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58
; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57
@@ -3943,9 +3941,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0
; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
-; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
+; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3
; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11
@@ -3964,24 +3964,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
-; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
-; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
-; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: s_nop 0
-; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
-; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16
+; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
+; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17
; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21
@@ -3990,21 +3982,26 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22
; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21
; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24
; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27
+; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26
; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
+; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: s_nop 0
+; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@@ -4015,16 +4012,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39
; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38
-; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0
+; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57
@@ -4197,29 +4195,20 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32:
; VI-NO-DS128: ; %bb.0:
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1
+; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
+; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
+; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1
; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
-; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
-; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
-; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
-; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
-; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
-; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30
; VI-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29
@@ -4229,7 +4218,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31
; VI-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16
; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34
; VI-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33
@@ -4247,16 +4236,24 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31
; VI-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16
; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
+; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
+; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
+; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32
; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
-; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31
; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16
; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
+; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14
@@ -4316,23 +4313,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
-; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
-; GFX9-NO-DS128-NEXT: s_nop 0
-; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30
; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29
@@ -4342,7 +4330,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31
; GFX9-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34
; GFX9-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33
@@ -4360,16 +4348,24 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31
; GFX9-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
+; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-NO-DS128-NEXT: s_nop 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32
; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31
; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14
@@ -4857,10 +4853,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_mov_b32_e32 v32, s1
; VI-DS128-NEXT: ds_read_b128 v[8:11], v32
; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
+; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
+; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-DS128-NEXT: s_add_u32 s88, s88, s11
; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10
; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16
@@ -4873,12 +4871,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8
; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16
-; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
-; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19
; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18
@@ -4899,8 +4891,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
+; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
@@ -4913,14 +4908,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
; VI-DS128-NEXT: v_mov_b32_e32 v32, s0
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; VI-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
@@ -4985,9 +4981,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1
; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32
; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
+; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
+; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10
; GFX9-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16
@@ -5001,13 +4999,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8
; GFX9-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16
-; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: s_nop 0
-; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
-; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18
@@ -5028,8 +5019,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
+; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: s_nop 0
+; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
@@ -5042,14 +5037,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; GFX9-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
index 1d1d3e4..9da7a79 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
@@ -15,24 +15,23 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr
; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s6, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_mov_b32 s8, s1
; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s4, s3
; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_mov_b32 s2, s1
-; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -63,10 +62,10 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s4, s3
@@ -100,25 +99,24 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i
; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s6, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_mov_b32 s8, s1
; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s4, s3
; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_mov_b32 s2, s1
-; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
@@ -141,24 +139,23 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7)
; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s6, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_mov_b32 s8, s1
; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s4, s3
; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_mov_b32 s2, s1
-; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index fc36ed9..84db54c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -128,10 +128,10 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11]
-; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc
; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
+; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc
; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SDAG-NEXT: s_mov_b32 s5, s10
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -181,24 +181,23 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX11-SDAG-NEXT: s_mov_b32 s9, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_mov_b32 s6, s3
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX11-SDAG-NEXT: s_mov_b32 s8, s1
; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-SDAG-NEXT: s_mov_b32 s5, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
; GFX11-SDAG-NEXT: s_mov_b32 s2, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -215,12 +214,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
; GFX11-GISEL-NEXT: s_mov_b32 s9, s2
; GFX11-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
; GFX11-GISEL-NEXT: s_mov_b32 s4, s1
; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
; GFX11-GISEL-NEXT: s_mov_b32 s6, s3
@@ -239,24 +238,23 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX12-SDAG-NEXT: s_mov_b32 s9, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_mov_b32 s6, s3
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: s_mov_b32 s8, s1
; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-SDAG-NEXT: s_mov_b32 s5, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
+; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
; GFX12-SDAG-NEXT: s_mov_b32 s2, s1
-; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -273,12 +271,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
; GFX12-GISEL-NEXT: s_mov_b32 s9, s2
; GFX12-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
; GFX12-GISEL-NEXT: s_mov_b32 s4, s1
; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
; GFX12-GISEL-NEXT: s_mov_b32 s6, s3
@@ -413,11 +411,11 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11]
-; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
+; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SDAG-NEXT: s_mov_b32 s5, s10
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -468,25 +466,24 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-SDAG-NEXT: s_mov_b32 s9, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_mov_b32 s6, s3
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX11-SDAG-NEXT: s_mov_b32 s8, s1
; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-SDAG-NEXT: s_mov_b32 s5, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
; GFX11-SDAG-NEXT: s_mov_b32 s2, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
@@ -503,13 +500,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
; GFX11-GISEL-NEXT: s_mov_b32 s9, s2
; GFX11-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_mov_b32 s4, s1
; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
; GFX11-GISEL-NEXT: s_mov_b32 s6, s3
@@ -528,25 +525,24 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-SDAG-NEXT: s_mov_b32 s9, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_mov_b32 s6, s3
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: s_mov_b32 s8, s1
; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-SDAG-NEXT: s_mov_b32 s5, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
; GFX12-SDAG-NEXT: s_mov_b32 s2, s1
-; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
@@ -563,13 +559,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
; GFX12-GISEL-NEXT: s_mov_b32 s9, s2
; GFX12-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_mov_b32 s4, s1
; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
; GFX12-GISEL-NEXT: s_mov_b32 s6, s3
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
index 4ab05c2..c1f4d7b 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
@@ -19,12 +19,12 @@ $_f2 = comdat any
define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce) local_unnamed_addr #0 {
; GCN-LABEL: test:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: v_mov_b32_e32 v0, 2
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: ds_write_b8 v1, v0
; GCN-NEXT: ds_read_u8 v2, v1 offset:2
; GCN-NEXT: ds_read_u16 v3, v1
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: v_mov_b32_e32 v0, 2
+; GCN-NEXT: ds_write_b8 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b8 v1, v2 offset:6
; GCN-NEXT: ds_write_b16 v1, v3 offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
index 24c1bfb..ccfd45b 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
@@ -75,15 +75,15 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x3(ptr addrspace(1) %arg, i
; GCN-LABEL: no_clobber_ds_load_stores_x3:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_mov_b32_e32 v2, 2
-; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: ds_write_b32 v1, v2 offset:256
+; GCN-NEXT: ds_write_b32 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 s0, s0, 2
-; GCN-NEXT: v_mov_b32_e32 v2, 3
-; GCN-NEXT: ds_write_b32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ds_write_b32 v1, v2 offset:256
+; GCN-NEXT: v_mov_b32_e32 v2, 3
; GCN-NEXT: ds_write_b32 v1, v2 offset:512
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: ds_read_b32 v3, v0 offset:256
diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll
index ae08054..ba53294 100644
--- a/llvm/test/CodeGen/AMDGPU/max.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.ll
@@ -774,9 +774,9 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
; GFX1250-NEXT: s_load_b32 s6, s[0:1], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_max_u32_e32 v0, s6, v0
diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
index 02f39e2..af7ca0f 100644
--- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
@@ -714,7 +714,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1
; ALIGNED-NEXT: s_cbranch_vccnz .LBB0_1
; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split
-; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8
@@ -1468,7 +1468,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1)
; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1
; ALIGNED-NEXT: s_cbranch_vccnz .LBB1_1
; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split
-; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8
@@ -1854,6 +1854,10 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4)
; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220
; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212
; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142
; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140
@@ -1862,10 +1866,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4)
; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130
; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128
-; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26
; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139
; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137
; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141
@@ -1901,14 +1901,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4)
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60
; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52
; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
-; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
-; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
-; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
-; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6
@@ -1923,6 +1915,14 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4)
; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25
; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19
; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
+; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
+; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
+; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
+; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11
; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103
; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7
@@ -3438,7 +3438,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0
; ALIGNED-NEXT: s_cbranch_vccnz .LBB3_1
; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split
-; ALIGNED-NEXT: s_clause 0x2f
+; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8
@@ -3741,23 +3741,23 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill
; ALIGNED-NEXT: .LBB4_1: ; %load-store-loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: s_clause 0x39
+; ALIGNED-NEXT: s_clause 0x3e
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25
-; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:19
; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30
; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32
; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33
; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35
; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36
; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37
@@ -3779,17 +3779,17 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53
; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54
; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55
-; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57
-; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:60
; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61
; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:62
; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:64
; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65
; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:66
-; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:67
; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:68
; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:69
@@ -3797,57 +3797,96 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71
; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76
; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78
; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: s_clause 0x33
+; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(56)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(55)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(54)
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(53)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(52)
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(51)
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(49)
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(48)
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(47)
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(46)
; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(45)
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5
-; ALIGNED-NEXT: s_waitcnt vmcnt(42)
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8
-; ALIGNED-NEXT: s_waitcnt vmcnt(40)
; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6
; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12
; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17
-; ALIGNED-NEXT: s_waitcnt vmcnt(39)
; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
-; ALIGNED-NEXT: s_waitcnt vmcnt(37)
; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
-; ALIGNED-NEXT: s_waitcnt vmcnt(35)
; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22
-; ALIGNED-NEXT: s_waitcnt vmcnt(33)
; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25
-; ALIGNED-NEXT: s_waitcnt vmcnt(31)
; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(29)
; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5
@@ -3856,76 +3895,83 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11
; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13
; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(27)
; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(25)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(23)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(21)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(16)
; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v38
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(14)
; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52
; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(8)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v54
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v65
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
@@ -3934,52 +3980,82 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247
; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
-; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(42)
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: s_waitcnt vmcnt(41)
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(39)
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: s_waitcnt vmcnt(38)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(36)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: s_waitcnt vmcnt(35)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(8)
+; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
@@ -4251,259 +4327,132 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT: s_clause 0x3
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10
+; ALIGNED-NEXT: s_waitcnt vmcnt(5)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159
-; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v127, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174
-; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176
-; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177
-; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178
-; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179
-; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180
-; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181
-; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182
-; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188
-; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189
-; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190
-; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x3e
-; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193
-; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194
-; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196
-; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197
-; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198
-; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199
-; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203
-; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223
-; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210
-; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224
-; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225
-; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226
-; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228
-; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229
-; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230
-; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231
-; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236
-; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237
-; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238
-; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239
-; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235
-; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232
-; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233
-; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234
-; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240
-; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241
-; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242
-; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243
-; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244
-; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245
-; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246
-; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247
-; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254
-; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255
-; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249
-; ALIGNED-NEXT: s_clause 0x6
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen
-; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2
-; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4
-; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5
-; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7
-; ALIGNED-NEXT: s_waitcnt vmcnt(62)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101
; ALIGNED-NEXT: v_lshl_or_b32 v106, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v103, 8, v114
; ALIGNED-NEXT: v_lshl_or_b32 v4, v100, 8, v112
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125
; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3
-; ALIGNED-NEXT: s_waitcnt vmcnt(60)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v97, 8, v98
-; ALIGNED-NEXT: s_waitcnt vmcnt(58)
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94
; ALIGNED-NEXT: v_lshl_or_b32 v4, v87, 8, v96
-; ALIGNED-NEXT: s_waitcnt vmcnt(14)
-; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
-; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110
; ALIGNED-NEXT: v_lshl_or_b32 v89, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v84, 8, v86
; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v83
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104
; ALIGNED-NEXT: v_lshl_or_b32 v74, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v81
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108
; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v71
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88
; ALIGNED-NEXT: v_lshl_or_b32 v46, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80
; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v53
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90
; ALIGNED-NEXT: v_lshl_or_b32 v117, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v70
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63
; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v64
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76
; ALIGNED-NEXT: v_lshl_or_b32 v115, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50
; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v48
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57
; ALIGNED-NEXT: v_lshl_or_b32 v99, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62
; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v33
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v82, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v37
; ALIGNED-NEXT: v_lshl_or_b32 v4, v31, 8, v34
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61
; ALIGNED-NEXT: v_lshl_or_b32 v66, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v29, 8, v30
; ALIGNED-NEXT: v_lshl_or_b32 v4, v26, 8, v28
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44
; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v24
; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v21
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45
; ALIGNED-NEXT: v_lshl_or_b32 v49, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, v20
; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v15
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118
; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v19
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41
; ALIGNED-NEXT: v_lshl_or_b32 v22, v73, 16, v4
; ALIGNED-NEXT: v_lshl_or_b32 v73, v11, 8, v12
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v4, v77, 16, v73
; ALIGNED-NEXT: v_lshl_or_b32 v73, v6, 8, v8
; ALIGNED-NEXT: v_lshl_or_b32 v77, v7, 8, v5
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen
+; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v3, v77, 16, v73
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:1
; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:3
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -4513,37 +4462,34 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73
; ALIGNED-NEXT: v_lshl_or_b32 v73, v109, 8, v107
; ALIGNED-NEXT: v_lshl_or_b32 v77, v1, 8, v120
+; ALIGNED-NEXT: s_clause 0x2
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10
; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73
; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:13
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v1
+; ALIGNED-NEXT: v_mov_b32_e32 v1, v107
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15
-; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: v_lshl_or_b32 v77, v107, 8, v0
-; ALIGNED-NEXT: v_mov_b32_e32 v1, v107
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8
-; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9
-; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: v_lshl_or_b32 v73, v120, 8, v122
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: v_lshl_or_b32 v77, v121, 8, v109
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:18
; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:16
@@ -4553,6 +4499,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228
; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224
; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v77
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -4625,6 +4572,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:220
; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:212
; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:202
; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:203
; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:201
@@ -4641,21 +4589,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:198
; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:196
; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:192
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
-; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
-; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288
; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:186
; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:187
; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:185
@@ -4672,18 +4605,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:182
; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:180
; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:176
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304
; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:170
; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:171
; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:169
@@ -4700,6 +4621,36 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:166
; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:164
; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:160
+; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154
+; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155
+; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153
+; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159
+; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264
@@ -4712,10 +4663,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256
-; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154
-; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155
-; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153
-; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157
@@ -5181,6 +5128,8 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388
; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:384
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10
+; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
@@ -5234,8 +5183,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400
-; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10
-; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13
@@ -5274,7 +5221,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0
; ALIGNED-NEXT: s_cbranch_vccnz .LBB4_1
; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split
-; ALIGNED-NEXT: s_clause 0x2f
+; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8
@@ -6797,7 +6744,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_5
; ALIGNED-NEXT: .LBB5_6: ; %Flow6
; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8
@@ -8296,7 +8243,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; ALIGNED-NEXT: s_cbranch_scc0 .LBB6_5
; ALIGNED-NEXT: .LBB6_6: ; %Flow8
; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8
@@ -8848,14 +8795,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60
; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52
; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
-; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
-; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
-; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
-; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:111
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v6
@@ -8871,6 +8810,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23
; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v19
; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
+; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
+; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
+; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
+; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11
; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:103
; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v7
@@ -9297,6 +9244,10 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476
; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468
; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142
; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140
@@ -9305,10 +9256,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130
; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128
-; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26
; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139
; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137
; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141
@@ -9344,14 +9291,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316
; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308
; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
-; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
-; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
-; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
-; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6
@@ -9366,6 +9305,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25
; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19
; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
+; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
+; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
+; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
+; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11
; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103
; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7
@@ -12198,7 +12145,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_cbranch_scc0 .LBB8_5
; ALIGNED-NEXT: .LBB8_6: ; %Flow19
; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; ALIGNED-NEXT: s_clause 0x2f
+; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8
@@ -12645,6 +12592,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-LABEL: memmove_p0_p5_sz2048:
; ALIGNED: ; %bb.0: ; %entry
; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT: s_mov_b32 s6, exec_lo
; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
@@ -12693,34 +12645,29 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
-; ALIGNED-NEXT: s_mov_b32 s6, exec_lo
; ALIGNED-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; ALIGNED-NEXT: v_cmpx_ge_u32_e64 v2, v0
; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s6
; ALIGNED-NEXT: s_cbranch_execz .LBB9_2
; ALIGNED-NEXT: .LBB9_1: ; %memmove_fwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: s_clause 0x39
+; ALIGNED-NEXT: s_clause 0x3e
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20
+; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24
; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25
; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
-; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30
; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31
; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32
; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33
; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35
; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36
; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37
@@ -12742,17 +12689,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53
; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54
; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60
; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:56
; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57
; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61
; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62
; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63
; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64
; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65
; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67
; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68
; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69
@@ -12760,58 +12707,94 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71
; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76
; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78
; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: s_clause 0x30
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(56)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(55)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(54)
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(53)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(52)
; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(51)
; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(49)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(48)
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(47)
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(46)
; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(45)
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5
-; ALIGNED-NEXT: s_waitcnt vmcnt(42)
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8
-; ALIGNED-NEXT: s_waitcnt vmcnt(40)
; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8
; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6
; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12
; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17
-; ALIGNED-NEXT: s_waitcnt vmcnt(39)
; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
-; ALIGNED-NEXT: s_waitcnt vmcnt(37)
; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
-; ALIGNED-NEXT: s_waitcnt vmcnt(35)
; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22
-; ALIGNED-NEXT: s_waitcnt vmcnt(33)
; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25
-; ALIGNED-NEXT: s_waitcnt vmcnt(31)
; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(29)
; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5
; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7
@@ -12819,82 +12802,81 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11
; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13
; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(27)
; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(25)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(23)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(21)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(16)
; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(14)
; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v38
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v39
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52
; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v55, 8, v65
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(8)
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: s_waitcnt vmcnt(61)
; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: s_waitcnt vmcnt(61)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
+; ALIGNED-NEXT: s_waitcnt vmcnt(61)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210
; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
@@ -12902,47 +12884,97 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0xc
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251
; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
-; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(44)
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
+; ALIGNED-NEXT: s_waitcnt vmcnt(43)
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(42)
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: s_waitcnt vmcnt(41)
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(40)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(33)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: s_waitcnt vmcnt(32)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(16)
+; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20
+; ALIGNED-NEXT: s_waitcnt vmcnt(12)
+; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12
+; ALIGNED-NEXT: s_waitcnt vmcnt(10)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
@@ -13214,289 +13246,158 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159
-; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v111
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174
-; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176
-; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177
-; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178
-; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179
-; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180
-; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181
-; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182
-; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188
-; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189
-; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190
-; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x3e
-; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193
-; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194
-; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196
-; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197
-; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198
-; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199
-; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203
-; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223
-; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210
-; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224
-; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226
-; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227
-; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228
-; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229
-; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230
-; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231
-; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236
-; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237
-; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238
-; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239
-; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235
-; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232
-; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233
-; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234
-; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240
-; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241
-; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242
-; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243
-; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244
-; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245
-; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246
-; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247
-; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252
-; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253
-; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255
-; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249
-; ALIGNED-NEXT: s_clause 0x5
-; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3
-; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6
-; ALIGNED-NEXT: s_waitcnt vmcnt(62)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113
; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118
-; ALIGNED-NEXT: s_waitcnt vmcnt(61)
; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123
; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3
-; ALIGNED-NEXT: s_waitcnt vmcnt(59)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104
; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100
-; ALIGNED-NEXT: s_waitcnt vmcnt(13)
-; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20
-; ALIGNED-NEXT: s_waitcnt vmcnt(9)
-; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120
; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98
; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107
; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v111
; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90
; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84
; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92
; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75
; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79
; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54
; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61
; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74
; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49
; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73
; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34
; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56
; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28
; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57
; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24
; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42
; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen
; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 16, v4
; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95
; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10
; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v15, v109, 16, v95
; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1
-; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v95, v95, 8, v3
; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95
; ALIGNED-NEXT: v_lshl_or_b32 v95, v5, 8, v125
; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 8, v6
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15
; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v6
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10
; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
@@ -13509,7 +13410,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v0
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:18
; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16
@@ -13518,10 +13418,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236
; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224
-; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704
; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708
; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v109
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
@@ -13590,6 +13491,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220
; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:212
; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:202
; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:203
; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:201
@@ -13606,22 +13509,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:198
; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:196
; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:192
-; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
-; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
-; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288
; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:186
; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:187
; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:185
@@ -13638,18 +13525,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:182
; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:180
; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:176
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304
; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:170
; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:171
; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:169
@@ -13666,6 +13541,36 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:166
; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:164
; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:160
+; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154
+; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155
+; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153
+; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159
+; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264
@@ -13678,10 +13583,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256
-; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154
-; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155
-; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153
-; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157
@@ -14147,6 +14048,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388
; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10
+; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
@@ -14200,8 +14103,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400
-; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10
-; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13
@@ -14253,23 +14154,23 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_mov_b32 s7, -1
; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: s_clause 0x39
+; ALIGNED-NEXT: s_clause 0x3e
; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20
; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21
; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22
; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:23
-; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24
-; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25
-; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26
; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:19
; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28
; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:29
; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:30
; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32
; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33
; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35
; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36
; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37
@@ -14291,17 +14192,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53
; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54
; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55
-; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56
-; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57
-; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58
; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60
; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61
; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62
; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64
; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65
; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66
-; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:67
; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:68
; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:69
@@ -14309,57 +14210,97 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71
; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76
; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77
-; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75
; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78
; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
+; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: s_clause 0x34
+; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(56)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(55)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(54)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(53)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(52)
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(51)
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(49)
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(48)
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(47)
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(46)
; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(45)
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5
-; ALIGNED-NEXT: s_waitcnt vmcnt(42)
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8
-; ALIGNED-NEXT: s_waitcnt vmcnt(40)
; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6
; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12
; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17
-; ALIGNED-NEXT: s_waitcnt vmcnt(39)
; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
-; ALIGNED-NEXT: s_waitcnt vmcnt(37)
; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
-; ALIGNED-NEXT: s_waitcnt vmcnt(35)
; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22
-; ALIGNED-NEXT: s_waitcnt vmcnt(33)
; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25
-; ALIGNED-NEXT: s_waitcnt vmcnt(31)
; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(29)
; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5
@@ -14368,75 +14309,88 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11
; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13
; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(27)
; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(25)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(23)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(21)
; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(16)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(14)
; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52
; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15
; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87
; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v6
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(8)
; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86
; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: s_clause 0x5
; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210
; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
@@ -14445,52 +14399,83 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247
; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87
-; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(51)
+; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(50)
+; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(49)
+; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(48)
+; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(47)
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: s_waitcnt vmcnt(46)
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(44)
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: s_waitcnt vmcnt(43)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(41)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: s_waitcnt vmcnt(35)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2
; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:81
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:80
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
@@ -14763,8 +14748,15 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2
; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158
@@ -14772,250 +14764,110 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159
-; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174
-; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176
-; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177
-; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178
-; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179
-; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180
-; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181
-; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182
-; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188
-; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189
-; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190
-; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x3e
-; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193
-; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194
-; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196
-; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197
-; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198
-; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199
-; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203
-; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223
-; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210
-; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224
-; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225
-; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226
-; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227
-; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228
-; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229
-; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230
-; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231
-; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236
-; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237
-; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238
-; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239
-; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235
-; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232
-; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233
-; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234
-; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240
-; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241
-; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242
-; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243
-; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244
-; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245
-; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246
-; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247
-; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252
-; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253
-; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:254
-; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255
-; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251
-; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248
-; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249
-; ALIGNED-NEXT: s_clause 0x6
-; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250
-; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen
-; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2
-; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4
-; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5
-; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6
-; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7
-; ALIGNED-NEXT: s_waitcnt vmcnt(62)
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101
; ALIGNED-NEXT: v_lshl_or_b32 v106, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v103, 8, v113
; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v102
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123
; ALIGNED-NEXT: v_lshl_or_b32 v90, v3, 16, v2
-; ALIGNED-NEXT: s_waitcnt vmcnt(60)
; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97
-; ALIGNED-NEXT: s_waitcnt vmcnt(58)
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93
; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96
-; ALIGNED-NEXT: s_waitcnt vmcnt(14)
-; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
-; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108
; ALIGNED-NEXT: v_lshl_or_b32 v88, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84
; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94
; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v68, 8, v80
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107
; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78
; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v71
; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89
; ALIGNED-NEXT: v_lshl_or_b32 v116, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63
; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74
; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v50
; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v39
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47
; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61
; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58
; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36
; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60
; ALIGNED-NEXT: v_lshl_or_b32 v64, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42
; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44
; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v22, 8, v24
; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v21
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117
; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119
; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250
; ALIGNED-NEXT: v_lshl_or_b32 v25, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v18
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen
; ALIGNED-NEXT: v_lshl_or_b32 v20, v62, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v62, v9, 8, v11
; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 16, v62
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: v_lshl_or_b32 v62, v5, 8, v6
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: v_lshl_or_b32 v76, v7, 8, v1
; ALIGNED-NEXT: v_lshl_or_b32 v2, v76, 16, v62
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:1
; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
@@ -15027,36 +14879,34 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62
; ALIGNED-NEXT: v_lshl_or_b32 v62, v109, 8, v104
; ALIGNED-NEXT: v_lshl_or_b32 v76, v122, 8, v110
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13
-; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15
; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76
-; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:8
; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9
; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122
+; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62
+; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122
; ALIGNED-NEXT: v_lshl_or_b32 v76, v120, 8, v109
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:18
; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:16
@@ -15065,10 +14915,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492
; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484
; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480
-; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704
; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708
; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v125, 8, v62
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
@@ -15137,6 +14988,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:476
; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468
; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:202
; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:203
; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:201
@@ -15153,22 +15006,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:198
; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:196
; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:192
-; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00
-; ALIGNED-NEXT: s_addc_u32 s5, s5, -1
-; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544
; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:186
; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:187
; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:185
@@ -15185,18 +15022,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:182
; ALIGNED-NEXT: flat_store_byte v[2:3], v60 offset:180
; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:176
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560
; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:170
; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:171
; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:169
@@ -15213,6 +15038,36 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:166
; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:164
; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:160
+; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154
+; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155
+; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153
+; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159
+; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00
+; ALIGNED-NEXT: s_addc_u32 s5, s5, -1
+; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520
@@ -15225,10 +15080,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512
-; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154
-; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155
-; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153
-; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:157
@@ -15694,6 +15545,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644
; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:640
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10
+; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11
+; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13
+; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:26
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
@@ -15747,10 +15602,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656
-; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10
-; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11
-; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13
-; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:15
@@ -15788,7 +15639,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_4
; ALIGNED-NEXT: .LBB9_5: ; %Flow11
; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; ALIGNED-NEXT: s_clause 0x2f
+; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
index 71900a4..3280048 100644
--- a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
@@ -90,19 +90,19 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh
; GFX12-GISEL-NEXT: s_load_b256 s[20:27], s[2:3], 0x40
; GFX12-GISEL-NEXT: s_load_b512 s[36:51], s[2:3], 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-GISEL-NEXT: buffer_load_b32 v2, off, s[16:19], null
; GFX12-GISEL-NEXT: buffer_load_b32 v3, off, s[20:23], null
; GFX12-GISEL-NEXT: buffer_load_b32 v4, off, s[40:43], null
+; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-GISEL-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x2
; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1
-; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x1
; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s1, 0xac0, v3
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1
+; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
; GFX12-GISEL-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 78207c2..1177474 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -185,44 +185,47 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
-; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
-; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
-; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_add_f32_e32 v4, s43, v4
; GFX900-NEXT: v_add_f32_e32 v3, s42, v3
; GFX900-NEXT: v_add_f32_e32 v2, s41, v2
; GFX900-NEXT: v_add_f32_e32 v1, s40, v1
-; GFX900-NEXT: s_waitcnt vmcnt(6)
-; GFX900-NEXT: v_add_f32_e32 v8, s39, v8
-; GFX900-NEXT: v_add_f32_e32 v7, s38, v7
-; GFX900-NEXT: v_add_f32_e32 v6, s37, v6
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v32, s19, v32
; GFX900-NEXT: v_add_f32_e32 v31, s18, v31
; GFX900-NEXT: v_add_f32_e32 v30, s17, v30
; GFX900-NEXT: v_add_f32_e32 v29, s16, v29
+; GFX900-NEXT: s_waitcnt vmcnt(4)
+; GFX900-NEXT: v_add_f32_e32 v8, s39, v8
+; GFX900-NEXT: v_add_f32_e32 v7, s38, v7
+; GFX900-NEXT: v_add_f32_e32 v6, s37, v6
; GFX900-NEXT: v_add_f32_e32 v5, s36, v5
+; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_add_f32_e32 v12, s51, v12
; GFX900-NEXT: v_add_f32_e32 v11, s50, v11
; GFX900-NEXT: v_add_f32_e32 v10, s49, v10
; GFX900-NEXT: v_add_f32_e32 v9, s48, v9
+; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_add_f32_e32 v16, s47, v16
; GFX900-NEXT: v_add_f32_e32 v15, s46, v15
; GFX900-NEXT: v_add_f32_e32 v14, s45, v14
; GFX900-NEXT: v_add_f32_e32 v13, s44, v13
+; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_add_f32_e32 v20, s15, v20
; GFX900-NEXT: v_add_f32_e32 v19, s14, v19
; GFX900-NEXT: v_add_f32_e32 v18, s13, v18
; GFX900-NEXT: v_add_f32_e32 v17, s12, v17
+; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v24, s11, v24
; GFX900-NEXT: v_add_f32_e32 v23, s10, v23
; GFX900-NEXT: v_add_f32_e32 v22, s9, v22
@@ -246,6 +249,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
@@ -255,9 +260,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[40:41]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[42:43]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
@@ -293,6 +296,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
@@ -302,9 +307,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[36:37]
; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[38:39]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
@@ -340,11 +343,14 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fadd_v32_vs:
; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_clause 0x2
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40
; GFX1250-SDAG-NEXT: s_clause 0x7
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16
; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48
@@ -354,22 +360,18 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-SDAG-NEXT: s_clause 0x1
-; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35]
@@ -409,6 +411,9 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
@@ -421,10 +426,6 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-GISEL-NEXT: s_clause 0x1
-; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
@@ -1442,44 +1443,47 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
-; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
-; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
-; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4
; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3
; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2
; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1
-; GFX900-NEXT: s_waitcnt vmcnt(6)
-; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8
-; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7
-; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32
; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31
; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30
; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29
+; GFX900-NEXT: s_waitcnt vmcnt(4)
+; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8
+; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7
+; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6
; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5
+; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12
; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11
; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10
; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9
+; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16
; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15
; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14
; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13
+; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20
; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19
; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18
; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17
+; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24
; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23
; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22
@@ -1503,6 +1507,8 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
@@ -1512,9 +1518,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7)
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[40:41]
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[42:43]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
@@ -1550,6 +1554,8 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
@@ -1559,9 +1565,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[36:37]
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[38:39]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
@@ -1597,11 +1601,14 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fmul_v32_vs:
; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_clause 0x2
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40
; GFX1250-SDAG-NEXT: s_clause 0x7
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16
; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48
@@ -1611,22 +1618,18 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-SDAG-NEXT: s_clause 0x1
-; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
@@ -1666,6 +1669,9 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
@@ -1678,10 +1684,6 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-GISEL-NEXT: s_clause 0x1
-; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
@@ -2273,44 +2275,47 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
-; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
-; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
-; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43
; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42
; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41
; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40
-; GFX900-NEXT: s_waitcnt vmcnt(6)
-; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39
-; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38
-; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19
; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18
; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17
; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16
+; GFX900-NEXT: s_waitcnt vmcnt(4)
+; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39
+; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38
+; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37
; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36
+; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51
; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50
; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49
; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48
+; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47
; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46
; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45
; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44
+; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15
; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14
; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13
; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12
+; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11
; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10
; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9
@@ -2334,6 +2339,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
@@ -2343,9 +2350,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7)
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41]
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
@@ -2381,6 +2386,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
@@ -2390,9 +2397,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7)
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[36:37], s[36:37]
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[38:39], s[38:39]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
@@ -2430,6 +2435,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
@@ -2442,10 +2450,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-SDAG-NEXT: s_clause 0x1
-; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[40:41]
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[42:43]
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51]
@@ -2496,6 +2500,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
@@ -2508,10 +2515,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-GISEL-NEXT: s_clause 0x1
-; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
index d0d5cc1..025d9e6 100644
--- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
+++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
@@ -56,11 +56,11 @@ body: |
; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
; GCN-NEXT: }
- ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (load (s32)) {
; GCN-NEXT: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
; GCN-NEXT: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
; GCN-NEXT: }
- ; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
+ ; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (store (s128)) {
; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128))
; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128))
; GCN-NEXT: }
@@ -359,6 +359,7 @@ tracksRegLiveness: true
body: |
bb.0:
; GCN-LABLE: name: no_sched_barrier_within_bundle
+ ; GCN-LABEL: name: no_sched_barrier_within_bundle
; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir
index 5fea0ae..e0266b9 100644
--- a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir
@@ -9,7 +9,7 @@ body: |
; GFX12-LABEL: name: post_bundle_vimage
; GFX12: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
+ ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) {
; GFX12-NEXT: $vgpr5 = IMAGE_LOAD_V1_V1_gfx12 $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
; GFX12-NEXT: $vgpr4 = IMAGE_LOAD_V1_V1_gfx12 killed $vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
; GFX12-NEXT: }
@@ -25,7 +25,7 @@ body: |
; GFX12-LABEL: name: post_bundle_vsample
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11
; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 {
+ ; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 :: (dereferenceable load (s128), addrspace 8) {
; GFX12-NEXT: $vgpr6_vgpr7_vgpr8_vgpr9 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr0, killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
; GFX12-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr2, killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
; GFX12-NEXT: }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 85a9aba..b91bdd2 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -398,11 +398,11 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[4:5]
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffd800, v2
; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe000, v2
; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5]
@@ -514,10 +514,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: ; => This Inner Loop Header: Depth=2
; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v2
; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc
-; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096
-; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048
-; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2
; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off
+; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off
@@ -526,13 +524,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s3, v2
; GFX900-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
+; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096
+; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048
; GFX900-NEXT: s_addk_i32 s5, 0x2000
; GFX900-NEXT: s_cmp_gt_u32 s5, 0x3fffff
-; GFX900-NEXT: s_waitcnt vmcnt(3)
+; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_add_co_u32_e32 v22, vcc, v8, v4
; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
; GFX900-NEXT: global_load_dwordx2 v[8:9], v[14:15], off offset:-4096
-; GFX900-NEXT: s_waitcnt vmcnt(3)
+; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_add_co_u32_e64 v24, s[0:1], v18, v22
; GFX900-NEXT: v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1]
; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
@@ -540,13 +540,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s4, v2
; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc
; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048
-; GFX900-NEXT: s_waitcnt vmcnt(5)
+; GFX900-NEXT: s_waitcnt vmcnt(7)
; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, v20, v24
; GFX900-NEXT: global_load_dwordx2 v[14:15], v[2:3], off
; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, v21, v25, vcc
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX900-NEXT: s_waitcnt vmcnt(5)
+; GFX900-NEXT: s_waitcnt vmcnt(7)
; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v16, v20
; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v21, vcc
; GFX900-NEXT: s_waitcnt vmcnt(4)
@@ -734,10 +734,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc
-; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
-; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048
-; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6
; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off
+; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc
; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[14:15], off
@@ -753,39 +751,42 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc
; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[22:23], off offset:-2048
; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off
+; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
+; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX90A-NEXT: s_addk_i32 s3, 0x2000
; GFX90A-NEXT: s_cmp_gt_u32 s3, 0x3fffff
-; GFX90A-NEXT: s_waitcnt vmcnt(8)
+; GFX90A-NEXT: s_waitcnt vmcnt(10)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(7)
+; GFX90A-NEXT: s_waitcnt vmcnt(9)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(6)
+; GFX90A-NEXT: s_waitcnt vmcnt(8)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(5)
+; GFX90A-NEXT: s_waitcnt vmcnt(7)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(4)
+; GFX90A-NEXT: s_waitcnt vmcnt(6)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v24, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v25, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(3)
+; GFX90A-NEXT: s_waitcnt vmcnt(5)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v26, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v27, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(2)
+; GFX90A-NEXT: s_waitcnt vmcnt(4)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: s_waitcnt vmcnt(3)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc
; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir
new file mode 100644
index 0000000..381cb8c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir
@@ -0,0 +1,131 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=register-coalescer -verify-coalescing -o - %s | FileCheck %s
+
+# This test checks the fix for the "Bad machine code: Defining instruction does not modify register" failure caused by a corrupt lane mask.
+
+---
+name: reg_coalescer_subreg_liveness
+tracksRegLiveness: true
+liveins:
+body: |
+ ; CHECK-LABEL: name: reg_coalescer_subreg_liveness
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1
+ ; CHECK-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: $vcc_lo = COPY $exec_lo
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 1
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc_lo
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $sgpr4_sgpr5
+
+ %0:sgpr_64 = COPY killed $sgpr4_sgpr5
+ %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ %2:sreg_32 = S_MOV_B32 1
+ undef %3.sub0:sgpr_128 = COPY %2
+ %4:sreg_32 = S_MOV_B32 0
+ undef %5.sub0:sgpr_256 = COPY %4
+ TENSOR_LOAD_TO_LDS_D2 %3, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ %6:sgpr_128 = COPY killed %3
+ %6.sub1:sgpr_128 = COPY killed %1
+ %7:sreg_32 = COPY $exec_lo
+ %8:sreg_32 = COPY %2
+ %9:sreg_32 = COPY %4
+
+ bb.1:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+
+ %10:sreg_32 = COPY killed %8
+ undef %11.sub0:sgpr_128 = COPY %2
+ %11.sub1:sgpr_128 = COPY killed %10
+ %11.sub2:sgpr_128 = COPY %2
+ %11.sub3:sgpr_128 = COPY %2
+ TENSOR_LOAD_TO_LDS_D2 killed %11, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ %12:sreg_32 = COPY killed %9
+ %13:sgpr_128 = COPY %6
+ %13.sub2:sgpr_128 = COPY killed %12
+ TENSOR_LOAD_TO_LDS_D2 killed %13, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ $vcc_lo = COPY %7
+ %8:sreg_32 = COPY %4
+ %9:sreg_32 = COPY %2
+ S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+---
+name: reg_coalescer_subreg_liveness_2
+tracksRegLiveness: true
+liveins:
+body: |
+ ; CHECK-LABEL: name: reg_coalescer_subreg_liveness_2
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_LOAD_DWORD_IMM]], implicit [[S_MOV_B32_]]
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $sgpr4_sgpr5
+
+ %0:sgpr_64 = COPY killed $sgpr4_sgpr5
+ %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+ %3:sreg_32 = S_MOV_B32 1
+ undef %4.sub0:sgpr_128 = COPY %3
+ %5:sgpr_128 = COPY %4
+ %5.sub1:sgpr_128 = COPY killed %2
+ %6:sgpr_128 = COPY %5
+ %6.sub2:sgpr_128 = COPY killed %1
+ %7:sreg_32 = S_MOV_B32 0
+ undef %8.sub0:sgpr_256 = COPY %7
+ %9:sreg_32 = COPY %3
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ %10:sreg_32 = COPY killed %9
+ undef %11.sub0:sgpr_128 = COPY %3
+ %11.sub1:sgpr_128 = COPY killed %10
+ S_NOP 0, implicit %5, implicit %8
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index 7a3bff8..fb9c477 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -28,43 +28,38 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s7, 0xe8f000
; SI-NEXT: s_add_u32 s4, s4, s0
; SI-NEXT: s_addc_u32 s5, s5, 0
-; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
-; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
-; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -74,49 +69,40 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -125,6 +111,19 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -139,6 +138,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -153,84 +153,70 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s7, 0xe80000
; VI-NEXT: s_add_u32 s4, s4, s0
; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -238,6 +224,19 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -252,6 +251,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -266,83 +266,69 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX9-MUBUF-NEXT: s_nop 0
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -350,6 +336,18 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
@@ -364,6 +362,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -392,7 +392,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -414,7 +413,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -422,24 +420,23 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -447,9 +444,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -457,6 +453,9 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -475,6 +474,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: s_clause 0x1
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -503,7 +504,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -525,7 +525,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -533,24 +532,23 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -558,9 +556,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -568,6 +565,9 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -586,6 +586,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: s_clause 0x1
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -974,42 +976,43 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -1024,8 +1027,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -1051,43 +1053,38 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s7, 0xe8f000
; SI-NEXT: s_add_u32 s4, s4, s0
; SI-NEXT: s_addc_u32 s5, s5, 0
-; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
-; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
-; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -1097,49 +1094,40 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1148,6 +1136,19 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -1162,6 +1163,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1176,84 +1178,70 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s7, 0xe80000
; VI-NEXT: s_add_u32 s4, s4, s0
; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1261,6 +1249,19 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -1275,6 +1276,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1289,83 +1291,69 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX9-MUBUF-NEXT: s_nop 0
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -1373,6 +1361,18 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
@@ -1387,6 +1387,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1415,7 +1417,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1437,7 +1438,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -1445,24 +1445,23 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -1470,9 +1469,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1480,6 +1478,9 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1498,6 +1499,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: s_clause 0x1
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1526,7 +1529,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1548,7 +1550,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -1556,24 +1557,23 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -1581,9 +1581,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1591,6 +1590,9 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1609,6 +1611,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: s_clause 0x1
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1997,42 +2001,43 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -2047,8 +2052,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -2074,43 +2078,38 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s7, 0xe8f000
; SI-NEXT: s_add_u32 s4, s4, s0
; SI-NEXT: s_addc_u32 s5, s5, 0
-; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
-; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
-; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -2120,49 +2119,40 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -2171,6 +2161,19 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -2185,6 +2188,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2199,84 +2203,70 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s7, 0xe80000
; VI-NEXT: s_add_u32 s4, s4, s0
; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -2284,6 +2274,19 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -2298,6 +2301,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2312,83 +2316,69 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX9-MUBUF-NEXT: s_nop 0
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -2396,6 +2386,18 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
@@ -2410,6 +2412,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2438,7 +2442,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -2460,7 +2463,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -2468,24 +2470,23 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -2493,9 +2494,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -2503,6 +2503,9 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -2521,6 +2524,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: s_clause 0x1
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2549,7 +2554,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -2571,7 +2575,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -2579,24 +2582,23 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -2604,9 +2606,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -2614,6 +2615,9 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -2632,6 +2636,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: s_clause 0x1
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -3020,42 +3026,43 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -3070,8 +3077,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -3097,43 +3103,38 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s7, 0xe8f000
; SI-NEXT: s_add_u32 s4, s4, s0
; SI-NEXT: s_addc_u32 s5, s5, 0
-; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
-; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
-; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -3143,49 +3144,40 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -3194,6 +3186,19 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -3208,6 +3213,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -3222,84 +3228,70 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s7, 0xe80000
; VI-NEXT: s_add_u32 s4, s4, s0
; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -3307,6 +3299,19 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -3321,6 +3326,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -3334,83 +3340,69 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5
; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX9-MUBUF-NEXT: s_nop 0
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -3418,6 +3410,18 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
@@ -3432,6 +3436,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -3459,7 +3465,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -3481,7 +3486,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -3489,24 +3493,23 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -3514,9 +3517,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -3524,6 +3526,9 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -3542,6 +3547,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: s_clause 0x1
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -3569,7 +3576,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -3591,7 +3597,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -3599,24 +3604,23 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -3624,9 +3628,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -3634,6 +3637,9 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -3652,6 +3658,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: s_clause 0x1
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -4040,42 +4048,43 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -4090,8 +4099,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -4117,43 +4125,38 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s7, 0xe8f000
; SI-NEXT: s_add_u32 s4, s4, s0
; SI-NEXT: s_addc_u32 s5, s5, 0
-; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
-; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
-; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -4163,49 +4166,40 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -4214,6 +4208,19 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -4228,6 +4235,7 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -4242,84 +4250,70 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s7, 0xe80000
; VI-NEXT: s_add_u32 s4, s4, s0
; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -4327,6 +4321,19 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -4341,6 +4348,7 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -4354,83 +4362,69 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5
; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX9-MUBUF-NEXT: s_nop 0
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -4438,6 +4432,18 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
@@ -4452,6 +4458,8 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -4479,7 +4487,6 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -4501,7 +4508,6 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -4509,24 +4515,23 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -4534,9 +4539,8 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -4544,6 +4548,9 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -4562,6 +4569,8 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: s_clause 0x1
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -4589,7 +4598,6 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -4611,7 +4619,6 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -4619,24 +4626,23 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -4644,9 +4650,8 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -4654,6 +4659,9 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -4672,6 +4680,8 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: s_clause 0x1
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -5060,42 +5070,43 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -5110,8 +5121,7 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -5141,43 +5151,38 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: s_mov_b32 s11, 0xe8f000
; SI-NEXT: s_add_u32 s8, s8, s6
; SI-NEXT: s_addc_u32 s9, s9, 0
-; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
-; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
-; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
-; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
-; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
-; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
-; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -5187,49 +5192,40 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -5238,6 +5234,19 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
@@ -5252,6 +5261,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
+; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; SI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
; SI-NEXT: s_mov_b32 s2, s5
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -5267,84 +5277,70 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: s_mov_b32 s11, 0xe80000
; VI-NEXT: s_add_u32 s8, s8, s6
; VI-NEXT: s_addc_u32 s9, s9, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -5352,6 +5348,19 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
@@ -5366,6 +5375,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
+; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; VI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
; VI-NEXT: s_mov_b32 s2, s5
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -5380,83 +5390,69 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5
; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
-; GFX9-MUBUF-NEXT: s_nop 0
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -5464,6 +5460,18 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
@@ -5478,8 +5486,9 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
-; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5
+; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-MUBUF-NEXT: ; return to shader part epilog
@@ -5491,10 +5500,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -5505,8 +5514,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -5528,7 +5535,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
@@ -5536,24 +5542,25 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
@@ -5561,9 +5568,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -5571,6 +5577,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -5589,6 +5597,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: s_clause 0x1
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
@@ -5602,10 +5612,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -5616,8 +5626,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -5639,7 +5647,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
@@ -5647,24 +5654,25 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
@@ -5672,9 +5680,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -5682,6 +5689,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -5700,6 +5709,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: s_clause 0x1
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
@@ -6093,10 +6104,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
@@ -6105,29 +6116,31 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -6142,8 +6155,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -6172,43 +6184,38 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: s_mov_b32 s11, 0xe8f000
; SI-NEXT: s_add_u32 s8, s8, s6
; SI-NEXT: s_addc_u32 s9, s9, 0
-; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
-; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
-; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
-; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
-; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
-; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
-; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -6218,49 +6225,40 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -6269,6 +6267,19 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
@@ -6283,6 +6294,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
+; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; SI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
; SI-NEXT: s_mov_b32 s2, s5
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -6298,84 +6310,70 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: s_mov_b32 s11, 0xe80000
; VI-NEXT: s_add_u32 s8, s8, s6
; VI-NEXT: s_addc_u32 s9, s9, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -6383,6 +6381,19 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
@@ -6397,6 +6408,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
+; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; VI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
; VI-NEXT: s_mov_b32 s2, s5
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -6411,83 +6423,69 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5
; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
-; GFX9-MUBUF-NEXT: s_nop 0
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -6495,6 +6493,18 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
@@ -6509,8 +6519,9 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
-; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5
+; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-MUBUF-NEXT: ; return to shader part epilog
@@ -6522,10 +6533,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -6536,8 +6547,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -6559,7 +6568,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
@@ -6567,24 +6575,25 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
@@ -6592,9 +6601,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -6602,6 +6610,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -6620,6 +6630,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: s_clause 0x1
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
@@ -6633,10 +6645,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -6647,8 +6659,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -6670,7 +6680,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
@@ -6678,24 +6687,25 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
@@ -6703,9 +6713,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -6713,6 +6722,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -6731,6 +6742,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: s_clause 0x1
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
@@ -7124,10 +7137,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
@@ -7136,29 +7149,31 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -7173,8 +7188,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
diff --git a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
index 71e4755..c90d788 100644
--- a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
+++ b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
@@ -3,9 +3,6 @@
define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) {
; CHECK-LABEL: excess_soft_clause_reg_pressure:
; CHECK: BB0_1: ; %for.cond28.preheader
-; CHECK: s_load_dwordx16
-; CHECK-NEXT: s_load_dwordx16
-
; CHECK: global_load_dword
; CHECK-NEXT: global_load_dword
; CHECK-NEXT: global_load_dword
@@ -18,11 +15,23 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspa
; CHECK-NOT: v_readlane_b32
; CHECK: s_load_dwordx16
+; CHECK-NEXT: s_load_dwordx16
+
+; CHECK-NOT: v_writelane_b32
+; CHECK-NOT: v_readlane_b32
+
; CHECK: s_load_dwordx16
+; CHECK-NEXT: s_load_dwordx16
+
+; CHECK-NOT: v_writelane_b32
+; CHECK-NOT: v_readlane_b32
+
; CHECK: s_load_dwordx16
+; CHECK-NEXT: s_load_dwordx16
; CHECK-NOT: v_writelane_b32
; CHECK-NOT: v_readlane_b32
+
entry:
%i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%i2 = load i64, ptr addrspace(4) %i, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index da48af1..1a0f75e 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -448,13 +448,13 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v2, v2, a[0:3]
-; GFX90A-NEXT: s_nop 4
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
; GFX90A-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 50056b6..b5474b8 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -10314,7 +10314,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040
@@ -10327,12 +10328,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:192
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:176
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:160
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:128
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:112
@@ -10344,7 +10343,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:80
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0
@@ -10358,10 +10359,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39]
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1
@@ -10468,13 +10466,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:224
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:208
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:192
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:176
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(3)
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:160
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 9cb22da..802de80 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -295,9 +295,9 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
-; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_writelane_b32 v40, s34, 3
; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
index 4d5ade4..1b4ed67 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
@@ -2481,10 +2481,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) {
; GFX7-GISEL: ; %bb.0: ; %entry
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_vector_reduce_mul_v2i64:
@@ -2502,10 +2503,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) {
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v2i64:
@@ -2524,8 +2526,8 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) {
; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, v1
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v4, v3, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v2, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v2, v[6:7]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v5, v2, v[6:7]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v8
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_mul_v2i64:
@@ -2626,9 +2628,9 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) {
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v8
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2]
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_vector_reduce_mul_v3i64:
@@ -2654,9 +2656,9 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) {
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v8
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2]
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v3i64:
@@ -2677,12 +2679,12 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v3, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v2, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v2, v[8:9]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v1, v2, v[8:9]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v8, v7, v10
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v5, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v4, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v4, v[2:3]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v4, v[2:3]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_mul_v3i64:
@@ -2816,10 +2818,10 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) {
; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v12
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1]
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_vector_reduce_mul_v4i64:
@@ -2853,10 +2855,10 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) {
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v12
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1]
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v4i64:
@@ -2881,16 +2883,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v0, v5, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v4, v[10:11]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v7, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, v6, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v3, v6, v[0:1]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v5, v0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v2, 0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v3, v6, v[0:1]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10
+; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v12
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v5, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v8, v4, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v9, v4, v[2:3]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v9, v4, v[2:3]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_mul_v4i64:
@@ -3068,31 +3070,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) {
; GFX7-GISEL: ; %bb.0: ; %entry
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v13
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v13, v20
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v21
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v20
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v7
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v8
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_vector_reduce_mul_v8i64:
@@ -3139,31 +3139,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) {
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v13
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v13, v20
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v21
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v20
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v7
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v8
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v8i64:
@@ -3204,32 +3202,32 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v0, v9, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v8, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v8, v[18:19]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v1, v8, v[18:19]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v11, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v10, v[8:9]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v3, v10, v[8:9]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v13, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v12, v[8:9]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v4
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v5, v12, v[8:9]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v15, 0
+; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v18
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v10
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v14, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v14, v[8:9]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v7, v14, v[8:9]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v20
+; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v16, v1, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v6
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v17, v2, v[8:9]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v5, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v16, v2, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v17, v2, v[8:9]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v2
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v5, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v10, v4, v[0:1]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v4, v[8:9]
; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v0
+; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v7, v10
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v3, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v2, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v2, v[4:5]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, v2, v[4:5]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_mul_v8i64:
@@ -3550,63 +3548,63 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) {
; GFX7-GISEL: ; %bb.0: ; %entry
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v33
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v24, v25
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v9
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35]
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2]
-; GFX7-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v17
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2]
+; GFX7-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v9
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v32
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26]
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0
; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v10
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v13
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v10
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_vector_reduce_mul_v16i64:
@@ -3695,63 +3693,63 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) {
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v33
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v24, v25
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v9
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35]
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2]
-; GFX8-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v17
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2]
+; GFX8-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v9
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v32
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26]
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v10
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v13
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v10
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i64:
@@ -3827,65 +3825,65 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) {
; GFX9-GISEL-NEXT: scratch_load_dword v31, off, s32
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v0, v17, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[0:1], v0, v16, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v16, v[34:35]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[0:1], v1, v16, v[34:35]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v2, v19, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v18, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v3, v18, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v4, v21, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v20, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v20, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v5, v20, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v6, v23, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v19, v3, v4
+; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v3, v18
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v22, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v22, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v7, v22, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v8, v25, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v5, v6
+; GFX9-GISEL-NEXT: v_add_u32_e32 v21, v5, v18
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v24, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v9, v24, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v24, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v10, v27, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v8
+; GFX9-GISEL-NEXT: v_add_u32_e32 v34, v1, v34
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v18
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v10, v26, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v11, v26, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v11, v26, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v12, v29, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v10
+; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v18
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v12, v28, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v13, v28, v[16:17]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v12
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v13, v28, v[16:17]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v36
+; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v18
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v14, v30, 0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v14, v31, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v15, v30, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v30, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v32, v1, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v13, v14
+; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v13, v18
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v33, v6, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v3, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v32, v6, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v33, v6, v[16:17]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v15, v6
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v8, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v3, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v8, v[0:1]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v34, v8, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v5, 0
+; GFX9-GISEL-NEXT: v_add_u32_e32 v13, v15, v18
+; GFX9-GISEL-NEXT: v_add_u32_e32 v15, v7, v0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v5, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v19, v10, v[2:3]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v20, v10, v[8:9]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v11, 0
; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v4, v9, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v20, v12, v[4:5]
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v21, v12, v[8:9]
; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v4
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v14, v0, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v11, v0, v[8:9]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v13, v0, v[8:9]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v3, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v2, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v2, v[0:1]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v9, v0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v15, v2, v[0:1]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v5, v10
+; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v9, v6
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v8, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v8, v[2:3]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v10, v8, v[2:3]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v4
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_mul_v16i64:
diff --git a/llvm/test/CodeGen/RISCV/mask-variable-shift.ll b/llvm/test/CodeGen/RISCV/mask-variable-shift.ll
new file mode 100644
index 0000000..4e73cee
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/mask-variable-shift.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=RV32
+; RUN: llc -mtriple=riscv64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=RV64
+
+define i32 @mask_pair(i32 %x, i32 %y) {
+; RV32-LABEL: mask_pair:
+; RV32: # %bb.0:
+; RV32-NEXT: srl a0, a0, a1
+; RV32-NEXT: sll a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mask_pair:
+; RV64: # %bb.0:
+; RV64-NEXT: srlw a0, a0, a1
+; RV64-NEXT: sllw a0, a0, a1
+; RV64-NEXT: ret
+ %shl = shl nsw i32 -1, %y
+ %and = and i32 %shl, %x
+ ret i32 %and
+}
+
+define i64 @mask_pair_64(i64 %x, i64 %y) {
+; RV32-LABEL: mask_pair_64:
+; RV32: # %bb.0:
+; RV32-NEXT: li a3, -1
+; RV32-NEXT: addi a4, a2, -32
+; RV32-NEXT: sll a3, a3, a2
+; RV32-NEXT: bltz a4, .LBB1_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: mv a2, a3
+; RV32-NEXT: j .LBB1_3
+; RV32-NEXT: .LBB1_2:
+; RV32-NEXT: not a2, a2
+; RV32-NEXT: lui a5, 524288
+; RV32-NEXT: addi a5, a5, -1
+; RV32-NEXT: srl a2, a5, a2
+; RV32-NEXT: or a2, a3, a2
+; RV32-NEXT: .LBB1_3:
+; RV32-NEXT: srai a4, a4, 31
+; RV32-NEXT: and a3, a4, a3
+; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: and a0, a3, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mask_pair_64:
+; RV64: # %bb.0:
+; RV64-NEXT: srl a0, a0, a1
+; RV64-NEXT: sll a0, a0, a1
+; RV64-NEXT: ret
+ %shl = shl nsw i64 -1, %y
+ %and = and i64 %shl, %x
+ ret i64 %and
+}
+
+define i128 @mask_pair_128(i128 %x, i128 %y) {
+; RV32-LABEL: mask_pair_128:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: lw a5, 0(a1)
+; RV32-NEXT: lw a4, 4(a1)
+; RV32-NEXT: lw a3, 8(a1)
+; RV32-NEXT: lw a1, 12(a1)
+; RV32-NEXT: lw a2, 0(a2)
+; RV32-NEXT: li a6, -1
+; RV32-NEXT: sw zero, 0(sp)
+; RV32-NEXT: sw zero, 4(sp)
+; RV32-NEXT: sw zero, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: addi a7, sp, 16
+; RV32-NEXT: sw a6, 16(sp)
+; RV32-NEXT: sw a6, 20(sp)
+; RV32-NEXT: sw a6, 24(sp)
+; RV32-NEXT: sw a6, 28(sp)
+; RV32-NEXT: srli a6, a2, 3
+; RV32-NEXT: andi a6, a6, 12
+; RV32-NEXT: sub a6, a7, a6
+; RV32-NEXT: lw a7, 4(a6)
+; RV32-NEXT: lw t0, 8(a6)
+; RV32-NEXT: lw t1, 12(a6)
+; RV32-NEXT: lw a6, 0(a6)
+; RV32-NEXT: andi t2, a2, 31
+; RV32-NEXT: xori t2, t2, 31
+; RV32-NEXT: sll t1, t1, a2
+; RV32-NEXT: srli t3, t0, 1
+; RV32-NEXT: sll t0, t0, a2
+; RV32-NEXT: srli t4, a7, 1
+; RV32-NEXT: sll a7, a7, a2
+; RV32-NEXT: sll a2, a6, a2
+; RV32-NEXT: srli a6, a6, 1
+; RV32-NEXT: srl t3, t3, t2
+; RV32-NEXT: srl t4, t4, t2
+; RV32-NEXT: srl a6, a6, t2
+; RV32-NEXT: and a2, a2, a5
+; RV32-NEXT: or a5, t1, t3
+; RV32-NEXT: or t0, t0, t4
+; RV32-NEXT: or a6, a7, a6
+; RV32-NEXT: and a4, a6, a4
+; RV32-NEXT: and a3, t0, a3
+; RV32-NEXT: and a1, a5, a1
+; RV32-NEXT: sw a2, 0(a0)
+; RV32-NEXT: sw a4, 4(a0)
+; RV32-NEXT: sw a3, 8(a0)
+; RV32-NEXT: sw a1, 12(a0)
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mask_pair_128:
+; RV64: # %bb.0:
+; RV64-NEXT: li a5, -1
+; RV64-NEXT: addi a4, a2, -64
+; RV64-NEXT: sll a3, a5, a2
+; RV64-NEXT: bltz a4, .LBB2_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a2, a3
+; RV64-NEXT: j .LBB2_3
+; RV64-NEXT: .LBB2_2:
+; RV64-NEXT: not a2, a2
+; RV64-NEXT: srli a5, a5, 1
+; RV64-NEXT: srl a2, a5, a2
+; RV64-NEXT: or a2, a3, a2
+; RV64-NEXT: .LBB2_3:
+; RV64-NEXT: srai a4, a4, 63
+; RV64-NEXT: and a3, a4, a3
+; RV64-NEXT: and a1, a2, a1
+; RV64-NEXT: and a0, a3, a0
+; RV64-NEXT: ret
+ %shl = shl nsw i128 -1, %y
+ %and = and i128 %shl, %x
+ ret i128 %and
+}
diff --git a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll
index 678d9a9..ff9b6a3 100644
--- a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll
+++ b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll
@@ -22,10 +22,10 @@ define void @main(i16 %in) {
; CHECK-NEXT: locghile %r3, 1
; CHECK-NEXT: o %r0, 0(%r1)
; CHECK-NEXT: larl %r1, g_222
-; CHECK-NEXT: lghi %r5, 0
; CHECK-NEXT: dsgfr %r2, %r0
+; CHECK-NEXT: lghi %r3, 0
; CHECK-NEXT: stgrl %r2, g_39
-; CHECK-NEXT: stc %r5, 19(%r1)
+; CHECK-NEXT: stc %r3, 19(%r1)
; CHECK-NEXT: br %r14
%tmp = load i32, ptr @g_151, align 4
%tmp3 = or i32 %tmp, 1
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
index ee2e58f..a1771f9 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
@@ -98,28 +98,29 @@ body: |
; CHECK-LABEL: name: foo
; CHECK: liveins: $q0, $r0, $r1, $r2, $lr
- ; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr
- ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
- ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
- ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
- ; CHECK: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg
- ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7
- ; CHECK: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2)
- ; CHECK: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1)
- ; CHECK: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0)
- ; CHECK: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr {
- ; CHECK: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr
- ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4)
- ; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4)
- ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4)
- ; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4)
- ; CHECK: }
- ; CHECK: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 {
- ; CHECK: MVE_VPST 4, implicit $vpr
- ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4)
- ; CHECK: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4)
- ; CHECK: }
- ; CHECK: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr
+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $lr, -4
+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $r7, -8
+ ; CHECK-NEXT: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg
+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $r7
+ ; CHECK-NEXT: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2)
+ ; CHECK-NEXT: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1)
+ ; CHECK-NEXT: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0)
+ ; CHECK-NEXT: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr :: (load (s128) from %ir.src, align 4), (store (s128) into %ir.dest, align 4), (load (s128) from %ir.src2, align 4), (store (s128) into %ir.dest2, align 4) {
+ ; CHECK-NEXT: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr
+ ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4)
+ ; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4)
+ ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4)
+ ; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4)
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 :: (load (s128) from %ir.src3, align 4), (store (s128) into %ir.dest3, align 4) {
+ ; CHECK-NEXT: MVE_VPST 4, implicit $vpr
+ ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4)
+ ; CHECK-NEXT: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4)
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0
$sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r7, killed $lr
frame-setup CFI_INSTRUCTION def_cfa_offset 8
frame-setup CFI_INSTRUCTION offset $lr, -4
diff --git a/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll b/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll
index 9c63819..1cfda8a 100644
--- a/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll
+++ b/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll
@@ -10,7 +10,7 @@ declare i32 @has_ptr_arg(ptr)
; CHECK-LABEL: test_invalid_rtn:
; CHECK: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.2, $pop[[L0]]{{$}}
+; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.1, $pop[[L0]]{{$}}
; CHECK-NEXT: drop $pop[[L1]]{{$}}
; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, 0{{$}}
; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid, $pop[[L0]]{{$}}
@@ -32,7 +32,7 @@ define void @test_struct_rtn() {
; CHECK-LABEL: test_invalid_arg:
; CHECK: i32.const $push[[L0:[0-9]+]]=, 2{{$}}
-; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.4, $pop[[L0]]{{$}}
+; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.2, $pop[[L0]]{{$}}
; CHECK-NEXT: drop $pop[[L1]]{{$}}
; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 2{{$}}
; CHECK-NEXT: call $push[[L1:[0-9]+]]=, has_ptr_arg, $pop[[L0]]{{$}}
@@ -54,8 +54,8 @@ entry:
; CHECK-NEXT: unreachable
; CHECK-NEXT: end_function
-; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.2:
-; CHECK-NEXT: .functype .Lhas_i64_arg_bitcast_invalid.2 (i32) -> (i32)
+; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.1:
+; CHECK-NEXT: .functype .Lhas_i64_arg_bitcast_invalid.1 (i32) -> (i32)
; CHECK-NEXT: unreachable
; CHECK-NEXT: end_function
@@ -64,7 +64,7 @@ entry:
; CHECK-NEXT: unreachable
; CHECK-NEXT: end_function
-; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.4:
-; CHECK-NEXT: .functype .Lhas_ptr_arg_bitcast_invalid.4 (i32) -> (i32)
+; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.2:
+; CHECK-NEXT: .functype .Lhas_ptr_arg_bitcast_invalid.2 (i32) -> (i32)
; CHECK-NEXT: unreachable
; CHECK-NEXT: end_function