aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll22769
1 files changed, 11889 insertions, 10880 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 39da45b..3e2b488 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -2144,112 +2144,110 @@ define inreg <32 x i16> @bitcast_v16i32_to_v32i16_scalar(<16 x i32> inreg %a, i3
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: s_and_b64 s[6:7], vcc, exec
-; SI-NEXT: v_readfirstlane_b32 s5, v1
+; SI-NEXT: v_mov_b32_e32 v33, v1
+; SI-NEXT: v_mov_b32_e32 v32, v0
+; SI-NEXT: v_mov_b32_e32 v34, s16
+; SI-NEXT: v_mov_b32_e32 v35, s17
+; SI-NEXT: v_mov_b32_e32 v36, s18
+; SI-NEXT: v_mov_b32_e32 v37, s19
+; SI-NEXT: v_mov_b32_e32 v38, s20
+; SI-NEXT: v_mov_b32_e32 v39, s21
+; SI-NEXT: v_mov_b32_e32 v48, s22
+; SI-NEXT: v_mov_b32_e32 v49, s23
+; SI-NEXT: v_mov_b32_e32 v50, s24
+; SI-NEXT: v_mov_b32_e32 v51, s25
+; SI-NEXT: v_mov_b32_e32 v52, s26
+; SI-NEXT: v_mov_b32_e32 v53, s27
+; SI-NEXT: v_mov_b32_e32 v54, s28
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: v_mov_b32_e32 v55, s29
; SI-NEXT: s_cbranch_scc0 .LBB13_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_lshr_b32 s56, s5, 16
-; SI-NEXT: s_lshr_b32 s57, s29, 16
-; SI-NEXT: s_lshr_b32 s58, s27, 16
-; SI-NEXT: s_lshr_b32 s59, s25, 16
-; SI-NEXT: s_lshr_b32 s60, s23, 16
-; SI-NEXT: s_lshr_b32 s61, s21, 16
-; SI-NEXT: s_lshr_b32 s62, s19, 16
-; SI-NEXT: s_lshr_b32 s63, s17, 16
-; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[8:9], s[28:29], 16
-; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16
-; SI-NEXT: s_lshr_b64 s[12:13], s[24:25], 16
-; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 16
-; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 16
-; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 16
-; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35
+; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16
+; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16
+; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16
+; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16
+; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16
+; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16
+; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16
; SI-NEXT: s_cbranch_execnz .LBB13_3
; SI-NEXT: .LBB13_2: ; %cmp.true
-; SI-NEXT: s_add_i32 s17, s17, 3
-; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: s_add_i32 s19, s19, 3
-; SI-NEXT: s_add_i32 s18, s18, 3
-; SI-NEXT: s_add_i32 s21, s21, 3
-; SI-NEXT: s_add_i32 s20, s20, 3
-; SI-NEXT: s_add_i32 s23, s23, 3
-; SI-NEXT: s_add_i32 s22, s22, 3
-; SI-NEXT: s_add_i32 s25, s25, 3
-; SI-NEXT: s_add_i32 s24, s24, 3
-; SI-NEXT: s_add_i32 s27, s27, 3
-; SI-NEXT: s_add_i32 s26, s26, 3
-; SI-NEXT: s_add_i32 s29, s29, 3
-; SI-NEXT: s_add_i32 s28, s28, 3
-; SI-NEXT: s_add_i32 s5, s5, 3
-; SI-NEXT: s_add_i32 s4, s4, 3
-; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[8:9], s[28:29], 16
-; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16
-; SI-NEXT: s_lshr_b64 s[12:13], s[24:25], 16
-; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 16
-; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 16
-; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 16
-; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16
-; SI-NEXT: s_lshr_b32 s56, s5, 16
-; SI-NEXT: s_lshr_b32 s57, s29, 16
-; SI-NEXT: s_lshr_b32 s58, s27, 16
-; SI-NEXT: s_lshr_b32 s59, s25, 16
-; SI-NEXT: s_lshr_b32 s60, s23, 16
-; SI-NEXT: s_lshr_b32 s61, s21, 16
-; SI-NEXT: s_lshr_b32 s62, s19, 16
-; SI-NEXT: s_lshr_b32 s63, s17, 16
+; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35
+; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34
+; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37
+; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36
+; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39
+; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38
+; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49
+; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48
+; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51
+; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50
+; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53
+; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52
+; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55
+; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54
+; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33
+; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
+; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16
+; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16
+; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16
+; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16
+; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16
+; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16
+; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35
; SI-NEXT: .LBB13_3: ; %end
-; SI-NEXT: v_mov_b32_e32 v0, s16
-; SI-NEXT: v_mov_b32_e32 v1, s44
-; SI-NEXT: v_mov_b32_e32 v2, s17
-; SI-NEXT: v_mov_b32_e32 v3, s63
-; SI-NEXT: v_mov_b32_e32 v4, s18
-; SI-NEXT: v_mov_b32_e32 v5, s42
-; SI-NEXT: v_mov_b32_e32 v6, s19
-; SI-NEXT: v_mov_b32_e32 v7, s62
-; SI-NEXT: v_mov_b32_e32 v8, s20
-; SI-NEXT: v_mov_b32_e32 v9, s40
-; SI-NEXT: v_mov_b32_e32 v10, s21
-; SI-NEXT: v_mov_b32_e32 v11, s61
-; SI-NEXT: v_mov_b32_e32 v12, s22
-; SI-NEXT: v_mov_b32_e32 v13, s14
-; SI-NEXT: v_mov_b32_e32 v14, s23
-; SI-NEXT: v_mov_b32_e32 v15, s60
-; SI-NEXT: v_mov_b32_e32 v16, s24
-; SI-NEXT: v_mov_b32_e32 v17, s12
-; SI-NEXT: v_mov_b32_e32 v18, s25
-; SI-NEXT: v_mov_b32_e32 v19, s59
-; SI-NEXT: v_mov_b32_e32 v20, s26
-; SI-NEXT: v_mov_b32_e32 v21, s10
-; SI-NEXT: v_mov_b32_e32 v22, s27
-; SI-NEXT: v_mov_b32_e32 v23, s58
-; SI-NEXT: v_mov_b32_e32 v24, s28
-; SI-NEXT: v_mov_b32_e32 v25, s8
-; SI-NEXT: v_mov_b32_e32 v26, s29
-; SI-NEXT: v_mov_b32_e32 v27, s57
-; SI-NEXT: v_mov_b32_e32 v28, s4
-; SI-NEXT: v_mov_b32_e32 v29, s6
-; SI-NEXT: v_mov_b32_e32 v30, s5
-; SI-NEXT: v_mov_b32_e32 v31, s56
+; SI-NEXT: v_mov_b32_e32 v0, v34
+; SI-NEXT: v_mov_b32_e32 v2, v35
+; SI-NEXT: v_mov_b32_e32 v4, v36
+; SI-NEXT: v_mov_b32_e32 v6, v37
+; SI-NEXT: v_mov_b32_e32 v8, v38
+; SI-NEXT: v_mov_b32_e32 v10, v39
+; SI-NEXT: v_mov_b32_e32 v12, v48
+; SI-NEXT: v_mov_b32_e32 v14, v49
+; SI-NEXT: v_mov_b32_e32 v16, v50
+; SI-NEXT: v_mov_b32_e32 v18, v51
+; SI-NEXT: v_mov_b32_e32 v20, v52
+; SI-NEXT: v_mov_b32_e32 v22, v53
+; SI-NEXT: v_mov_b32_e32 v24, v54
+; SI-NEXT: v_mov_b32_e32 v26, v55
+; SI-NEXT: v_mov_b32_e32 v28, v32
+; SI-NEXT: v_mov_b32_e32 v30, v33
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB13_4:
-; SI-NEXT: ; implicit-def: $sgpr44
-; SI-NEXT: ; implicit-def: $sgpr63
-; SI-NEXT: ; implicit-def: $sgpr42
-; SI-NEXT: ; implicit-def: $sgpr62
-; SI-NEXT: ; implicit-def: $sgpr40
-; SI-NEXT: ; implicit-def: $sgpr61
-; SI-NEXT: ; implicit-def: $sgpr14
-; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr12
-; SI-NEXT: ; implicit-def: $sgpr59
-; SI-NEXT: ; implicit-def: $sgpr10
-; SI-NEXT: ; implicit-def: $sgpr58
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: ; implicit-def: $sgpr57
-; SI-NEXT: ; implicit-def: $sgpr6
-; SI-NEXT: ; implicit-def: $sgpr56
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: s_branch .LBB13_2
;
; VI-LABEL: bitcast_v16i32_to_v32i16_scalar:
@@ -2882,111 +2880,139 @@ define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i3
; VI-LABEL: bitcast_v32i16_to_v16i32_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, s16
+; VI-NEXT: v_mov_b32_e32 v4, s17
+; VI-NEXT: v_mov_b32_e32 v5, s18
+; VI-NEXT: v_mov_b32_e32 v6, s19
+; VI-NEXT: v_mov_b32_e32 v7, s20
+; VI-NEXT: v_mov_b32_e32 v8, s21
+; VI-NEXT: v_mov_b32_e32 v9, s22
+; VI-NEXT: v_mov_b32_e32 v10, s23
+; VI-NEXT: v_mov_b32_e32 v11, s24
+; VI-NEXT: v_mov_b32_e32 v12, s25
+; VI-NEXT: v_mov_b32_e32 v13, s26
+; VI-NEXT: v_mov_b32_e32 v14, s27
+; VI-NEXT: v_mov_b32_e32 v15, s28
+; VI-NEXT: v_mov_b32_e32 v16, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: v_readfirstlane_b32 s6, v0
+; VI-NEXT: v_readfirstlane_b32 s6, v3
+; VI-NEXT: v_readfirstlane_b32 s7, v4
+; VI-NEXT: v_readfirstlane_b32 s8, v5
+; VI-NEXT: v_readfirstlane_b32 s9, v6
+; VI-NEXT: v_readfirstlane_b32 s10, v7
+; VI-NEXT: v_readfirstlane_b32 s11, v8
+; VI-NEXT: v_readfirstlane_b32 s12, v9
+; VI-NEXT: v_readfirstlane_b32 s13, v10
+; VI-NEXT: v_readfirstlane_b32 s14, v11
+; VI-NEXT: v_readfirstlane_b32 s15, v12
+; VI-NEXT: v_readfirstlane_b32 s16, v13
+; VI-NEXT: v_readfirstlane_b32 s17, v14
+; VI-NEXT: v_readfirstlane_b32 s18, v15
+; VI-NEXT: v_readfirstlane_b32 s19, v16
+; VI-NEXT: v_readfirstlane_b32 s20, v0
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_readfirstlane_b32 s7, v1
+; VI-NEXT: v_readfirstlane_b32 s21, v1
; VI-NEXT: s_cbranch_scc0 .LBB15_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB15_3
; VI-NEXT: .LBB15_2: ; %cmp.true
-; VI-NEXT: s_add_i32 s5, s7, 3
-; VI-NEXT: s_and_b32 s4, s7, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s21, 3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s6, 3
-; VI-NEXT: s_add_i32 s7, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s6, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s20, 3
+; VI-NEXT: s_add_i32 s21, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s29, 3
-; VI-NEXT: s_add_i32 s6, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s19, 3
+; VI-NEXT: s_add_i32 s20, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s28, 3
-; VI-NEXT: s_add_i32 s29, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s18, 3
+; VI-NEXT: s_add_i32 s19, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s27, 3
-; VI-NEXT: s_add_i32 s28, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s17, 3
+; VI-NEXT: s_add_i32 s18, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s26, 3
-; VI-NEXT: s_add_i32 s27, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s16, 3
+; VI-NEXT: s_add_i32 s17, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s25, 3
-; VI-NEXT: s_add_i32 s26, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s15, 3
+; VI-NEXT: s_add_i32 s16, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s15, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s24, 3
-; VI-NEXT: s_add_i32 s25, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s14, 3
+; VI-NEXT: s_add_i32 s15, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s14, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s23, 3
-; VI-NEXT: s_add_i32 s24, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s13, 3
+; VI-NEXT: s_add_i32 s14, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s13, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s22, 3
-; VI-NEXT: s_add_i32 s23, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s12, 3
+; VI-NEXT: s_add_i32 s13, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s12, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s21, 3
-; VI-NEXT: s_add_i32 s22, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s11, 3
+; VI-NEXT: s_add_i32 s12, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s11, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s20, 3
-; VI-NEXT: s_add_i32 s21, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s10, 3
+; VI-NEXT: s_add_i32 s11, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s10, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s19, 3
-; VI-NEXT: s_add_i32 s20, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s9, 3
+; VI-NEXT: s_add_i32 s10, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s9, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s18, 3
-; VI-NEXT: s_add_i32 s19, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s8, 3
+; VI-NEXT: s_add_i32 s9, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s8, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s17, 3
-; VI-NEXT: s_add_i32 s18, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s7, 3
+; VI-NEXT: s_add_i32 s8, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s7, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s16, 3
-; VI-NEXT: s_add_i32 s17, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s6, 3
+; VI-NEXT: s_add_i32 s7, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s6, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s16, s4, 0x30000
+; VI-NEXT: s_add_i32 s6, s4, 0x30000
; VI-NEXT: .LBB15_3: ; %end
-; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v1, s17
-; VI-NEXT: v_mov_b32_e32 v2, s18
-; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: v_mov_b32_e32 v4, s20
-; VI-NEXT: v_mov_b32_e32 v5, s21
-; VI-NEXT: v_mov_b32_e32 v6, s22
-; VI-NEXT: v_mov_b32_e32 v7, s23
-; VI-NEXT: v_mov_b32_e32 v8, s24
-; VI-NEXT: v_mov_b32_e32 v9, s25
-; VI-NEXT: v_mov_b32_e32 v10, s26
-; VI-NEXT: v_mov_b32_e32 v11, s27
-; VI-NEXT: v_mov_b32_e32 v12, s28
-; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: v_mov_b32_e32 v14, s6
-; VI-NEXT: v_mov_b32_e32 v15, s7
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: v_mov_b32_e32 v4, s10
+; VI-NEXT: v_mov_b32_e32 v5, s11
+; VI-NEXT: v_mov_b32_e32 v6, s12
+; VI-NEXT: v_mov_b32_e32 v7, s13
+; VI-NEXT: v_mov_b32_e32 v8, s14
+; VI-NEXT: v_mov_b32_e32 v9, s15
+; VI-NEXT: v_mov_b32_e32 v10, s16
+; VI-NEXT: v_mov_b32_e32 v11, s17
+; VI-NEXT: v_mov_b32_e32 v12, s18
+; VI-NEXT: v_mov_b32_e32 v13, s19
+; VI-NEXT: v_mov_b32_e32 v14, s20
+; VI-NEXT: v_mov_b32_e32 v15, s21
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB15_4:
; VI-NEXT: s_branch .LBB15_2
@@ -3403,124 +3429,152 @@ define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i
; SI-LABEL: bitcast_v16i32_to_v32f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, s16
+; SI-NEXT: v_mov_b32_e32 v4, s17
+; SI-NEXT: v_mov_b32_e32 v5, s18
+; SI-NEXT: v_mov_b32_e32 v6, s19
+; SI-NEXT: v_mov_b32_e32 v7, s20
+; SI-NEXT: v_mov_b32_e32 v8, s21
+; SI-NEXT: v_mov_b32_e32 v9, s22
+; SI-NEXT: v_mov_b32_e32 v10, s23
+; SI-NEXT: v_mov_b32_e32 v11, s24
+; SI-NEXT: v_mov_b32_e32 v12, s25
+; SI-NEXT: v_mov_b32_e32 v13, s26
+; SI-NEXT: v_mov_b32_e32 v14, s27
+; SI-NEXT: v_mov_b32_e32 v15, s28
+; SI-NEXT: v_mov_b32_e32 v16, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT: v_readfirstlane_b32 s21, v3
+; SI-NEXT: v_readfirstlane_b32 s20, v4
+; SI-NEXT: v_readfirstlane_b32 s19, v5
+; SI-NEXT: v_readfirstlane_b32 s18, v6
+; SI-NEXT: v_readfirstlane_b32 s17, v7
+; SI-NEXT: v_readfirstlane_b32 s16, v8
+; SI-NEXT: v_readfirstlane_b32 s15, v9
+; SI-NEXT: v_readfirstlane_b32 s14, v10
+; SI-NEXT: v_readfirstlane_b32 s13, v11
+; SI-NEXT: v_readfirstlane_b32 s12, v12
+; SI-NEXT: v_readfirstlane_b32 s11, v13
+; SI-NEXT: v_readfirstlane_b32 s10, v14
+; SI-NEXT: v_readfirstlane_b32 s8, v15
+; SI-NEXT: v_readfirstlane_b32 s7, v16
; SI-NEXT: v_readfirstlane_b32 s6, v0
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_readfirstlane_b32 s7, v1
+; SI-NEXT: v_readfirstlane_b32 s9, v1
; SI-NEXT: s_cbranch_scc0 .LBB17_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_lshr_b32 s4, s7, 16
+; SI-NEXT: s_lshr_b32 s4, s9, 16
; SI-NEXT: v_cvt_f32_f16_e32 v31, s4
; SI-NEXT: s_lshr_b32 s4, s6, 16
; SI-NEXT: v_cvt_f32_f16_e32 v29, s4
-; SI-NEXT: s_lshr_b32 s4, s29, 16
+; SI-NEXT: s_lshr_b32 s4, s7, 16
; SI-NEXT: v_cvt_f32_f16_e32 v27, s4
-; SI-NEXT: s_lshr_b32 s4, s28, 16
+; SI-NEXT: s_lshr_b32 s4, s8, 16
; SI-NEXT: v_cvt_f32_f16_e32 v25, s4
-; SI-NEXT: s_lshr_b32 s4, s27, 16
+; SI-NEXT: s_lshr_b32 s4, s10, 16
; SI-NEXT: v_cvt_f32_f16_e32 v23, s4
-; SI-NEXT: s_lshr_b32 s4, s26, 16
+; SI-NEXT: s_lshr_b32 s4, s11, 16
; SI-NEXT: v_cvt_f32_f16_e32 v21, s4
-; SI-NEXT: s_lshr_b32 s4, s25, 16
+; SI-NEXT: s_lshr_b32 s4, s12, 16
; SI-NEXT: v_cvt_f32_f16_e32 v19, s4
-; SI-NEXT: s_lshr_b32 s4, s24, 16
+; SI-NEXT: s_lshr_b32 s4, s13, 16
; SI-NEXT: v_cvt_f32_f16_e32 v17, s4
-; SI-NEXT: s_lshr_b32 s4, s23, 16
+; SI-NEXT: s_lshr_b32 s4, s14, 16
; SI-NEXT: v_cvt_f32_f16_e32 v15, s4
-; SI-NEXT: s_lshr_b32 s4, s22, 16
+; SI-NEXT: s_lshr_b32 s4, s15, 16
; SI-NEXT: v_cvt_f32_f16_e32 v13, s4
-; SI-NEXT: s_lshr_b32 s4, s21, 16
+; SI-NEXT: s_lshr_b32 s4, s16, 16
; SI-NEXT: v_cvt_f32_f16_e32 v11, s4
-; SI-NEXT: s_lshr_b32 s4, s20, 16
+; SI-NEXT: s_lshr_b32 s4, s17, 16
; SI-NEXT: v_cvt_f32_f16_e32 v9, s4
-; SI-NEXT: s_lshr_b32 s4, s19, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
; SI-NEXT: s_lshr_b32 s4, s18, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
+; SI-NEXT: s_lshr_b32 s4, s19, 16
; SI-NEXT: v_cvt_f32_f16_e32 v5, s4
-; SI-NEXT: s_lshr_b32 s4, s17, 16
+; SI-NEXT: s_lshr_b32 s4, s20, 16
; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
-; SI-NEXT: s_lshr_b32 s4, s16, 16
+; SI-NEXT: s_lshr_b32 s4, s21, 16
; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT: v_cvt_f32_f16_e32 v30, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v30, s9
; SI-NEXT: v_cvt_f32_f16_e32 v28, s6
-; SI-NEXT: v_cvt_f32_f16_e32 v26, s29
-; SI-NEXT: v_cvt_f32_f16_e32 v24, s28
-; SI-NEXT: v_cvt_f32_f16_e32 v22, s27
-; SI-NEXT: v_cvt_f32_f16_e32 v20, s26
-; SI-NEXT: v_cvt_f32_f16_e32 v18, s25
-; SI-NEXT: v_cvt_f32_f16_e32 v16, s24
-; SI-NEXT: v_cvt_f32_f16_e32 v14, s23
-; SI-NEXT: v_cvt_f32_f16_e32 v12, s22
-; SI-NEXT: v_cvt_f32_f16_e32 v10, s21
-; SI-NEXT: v_cvt_f32_f16_e32 v8, s20
-; SI-NEXT: v_cvt_f32_f16_e32 v6, s19
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s18
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s17
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s16
+; SI-NEXT: v_cvt_f32_f16_e32 v26, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v24, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v22, s10
+; SI-NEXT: v_cvt_f32_f16_e32 v20, s11
+; SI-NEXT: v_cvt_f32_f16_e32 v18, s12
+; SI-NEXT: v_cvt_f32_f16_e32 v16, s13
+; SI-NEXT: v_cvt_f32_f16_e32 v14, s14
+; SI-NEXT: v_cvt_f32_f16_e32 v12, s15
+; SI-NEXT: v_cvt_f32_f16_e32 v10, s16
+; SI-NEXT: v_cvt_f32_f16_e32 v8, s17
+; SI-NEXT: v_cvt_f32_f16_e32 v6, s18
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s19
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s20
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s21
; SI-NEXT: s_cbranch_execnz .LBB17_3
; SI-NEXT: .LBB17_2: ; %cmp.true
-; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: s_add_i32 s17, s17, 3
-; SI-NEXT: s_add_i32 s18, s18, 3
-; SI-NEXT: s_add_i32 s19, s19, 3
-; SI-NEXT: s_add_i32 s20, s20, 3
; SI-NEXT: s_add_i32 s21, s21, 3
-; SI-NEXT: s_add_i32 s22, s22, 3
-; SI-NEXT: s_add_i32 s23, s23, 3
-; SI-NEXT: s_add_i32 s24, s24, 3
-; SI-NEXT: s_add_i32 s25, s25, 3
-; SI-NEXT: s_add_i32 s26, s26, 3
-; SI-NEXT: s_add_i32 s27, s27, 3
-; SI-NEXT: s_add_i32 s28, s28, 3
-; SI-NEXT: s_add_i32 s29, s29, 3
-; SI-NEXT: s_add_i32 s6, s6, 3
+; SI-NEXT: s_add_i32 s20, s20, 3
+; SI-NEXT: s_add_i32 s19, s19, 3
+; SI-NEXT: s_add_i32 s18, s18, 3
+; SI-NEXT: s_add_i32 s17, s17, 3
+; SI-NEXT: s_add_i32 s16, s16, 3
+; SI-NEXT: s_add_i32 s15, s15, 3
+; SI-NEXT: s_add_i32 s14, s14, 3
+; SI-NEXT: s_add_i32 s13, s13, 3
+; SI-NEXT: s_add_i32 s12, s12, 3
+; SI-NEXT: s_add_i32 s11, s11, 3
+; SI-NEXT: s_add_i32 s10, s10, 3
+; SI-NEXT: s_add_i32 s8, s8, 3
; SI-NEXT: s_add_i32 s7, s7, 3
-; SI-NEXT: s_lshr_b32 s4, s16, 16
-; SI-NEXT: s_lshr_b32 s5, s17, 16
-; SI-NEXT: s_lshr_b32 s8, s18, 16
-; SI-NEXT: s_lshr_b32 s9, s19, 16
-; SI-NEXT: s_lshr_b32 s10, s20, 16
-; SI-NEXT: s_lshr_b32 s11, s21, 16
-; SI-NEXT: s_lshr_b32 s12, s22, 16
-; SI-NEXT: s_lshr_b32 s13, s23, 16
-; SI-NEXT: s_lshr_b32 s14, s24, 16
-; SI-NEXT: s_lshr_b32 s15, s25, 16
-; SI-NEXT: s_lshr_b32 s40, s26, 16
-; SI-NEXT: s_lshr_b32 s41, s27, 16
-; SI-NEXT: s_lshr_b32 s42, s28, 16
-; SI-NEXT: s_lshr_b32 s43, s29, 16
+; SI-NEXT: s_add_i32 s6, s6, 3
+; SI-NEXT: s_add_i32 s9, s9, 3
+; SI-NEXT: s_lshr_b32 s4, s21, 16
+; SI-NEXT: s_lshr_b32 s5, s20, 16
+; SI-NEXT: s_lshr_b32 s22, s19, 16
+; SI-NEXT: s_lshr_b32 s23, s18, 16
+; SI-NEXT: s_lshr_b32 s24, s17, 16
+; SI-NEXT: s_lshr_b32 s25, s16, 16
+; SI-NEXT: s_lshr_b32 s26, s15, 16
+; SI-NEXT: s_lshr_b32 s27, s14, 16
+; SI-NEXT: s_lshr_b32 s28, s13, 16
+; SI-NEXT: s_lshr_b32 s29, s12, 16
+; SI-NEXT: s_lshr_b32 s40, s11, 16
+; SI-NEXT: s_lshr_b32 s41, s10, 16
+; SI-NEXT: s_lshr_b32 s42, s8, 16
+; SI-NEXT: s_lshr_b32 s43, s7, 16
; SI-NEXT: s_lshr_b32 s44, s6, 16
-; SI-NEXT: s_lshr_b32 s45, s7, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v30, s7
+; SI-NEXT: s_lshr_b32 s45, s9, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v30, s9
; SI-NEXT: v_cvt_f32_f16_e32 v28, s6
-; SI-NEXT: v_cvt_f32_f16_e32 v26, s29
-; SI-NEXT: v_cvt_f32_f16_e32 v24, s28
-; SI-NEXT: v_cvt_f32_f16_e32 v22, s27
-; SI-NEXT: v_cvt_f32_f16_e32 v20, s26
-; SI-NEXT: v_cvt_f32_f16_e32 v18, s25
-; SI-NEXT: v_cvt_f32_f16_e32 v16, s24
-; SI-NEXT: v_cvt_f32_f16_e32 v14, s23
-; SI-NEXT: v_cvt_f32_f16_e32 v12, s22
-; SI-NEXT: v_cvt_f32_f16_e32 v10, s21
-; SI-NEXT: v_cvt_f32_f16_e32 v8, s20
-; SI-NEXT: v_cvt_f32_f16_e32 v6, s19
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s18
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s17
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s16
+; SI-NEXT: v_cvt_f32_f16_e32 v26, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v24, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v22, s10
+; SI-NEXT: v_cvt_f32_f16_e32 v20, s11
+; SI-NEXT: v_cvt_f32_f16_e32 v18, s12
+; SI-NEXT: v_cvt_f32_f16_e32 v16, s13
+; SI-NEXT: v_cvt_f32_f16_e32 v14, s14
+; SI-NEXT: v_cvt_f32_f16_e32 v12, s15
+; SI-NEXT: v_cvt_f32_f16_e32 v10, s16
+; SI-NEXT: v_cvt_f32_f16_e32 v8, s17
+; SI-NEXT: v_cvt_f32_f16_e32 v6, s18
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s19
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s20
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s21
; SI-NEXT: v_cvt_f32_f16_e32 v31, s45
; SI-NEXT: v_cvt_f32_f16_e32 v29, s44
; SI-NEXT: v_cvt_f32_f16_e32 v27, s43
; SI-NEXT: v_cvt_f32_f16_e32 v25, s42
; SI-NEXT: v_cvt_f32_f16_e32 v23, s41
; SI-NEXT: v_cvt_f32_f16_e32 v21, s40
-; SI-NEXT: v_cvt_f32_f16_e32 v19, s15
-; SI-NEXT: v_cvt_f32_f16_e32 v17, s14
-; SI-NEXT: v_cvt_f32_f16_e32 v15, s13
-; SI-NEXT: v_cvt_f32_f16_e32 v13, s12
-; SI-NEXT: v_cvt_f32_f16_e32 v11, s11
-; SI-NEXT: v_cvt_f32_f16_e32 v9, s10
-; SI-NEXT: v_cvt_f32_f16_e32 v7, s9
-; SI-NEXT: v_cvt_f32_f16_e32 v5, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v19, s29
+; SI-NEXT: v_cvt_f32_f16_e32 v17, s28
+; SI-NEXT: v_cvt_f32_f16_e32 v15, s27
+; SI-NEXT: v_cvt_f32_f16_e32 v13, s26
+; SI-NEXT: v_cvt_f32_f16_e32 v11, s25
+; SI-NEXT: v_cvt_f32_f16_e32 v9, s24
+; SI-NEXT: v_cvt_f32_f16_e32 v7, s23
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s22
; SI-NEXT: v_cvt_f32_f16_e32 v3, s5
; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
; SI-NEXT: .LBB17_3: ; %end
@@ -4788,7 +4842,35 @@ define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a
; SI-LABEL: bitcast_v16i32_to_v32bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, s16
+; SI-NEXT: v_mov_b32_e32 v4, s17
+; SI-NEXT: v_mov_b32_e32 v5, s18
+; SI-NEXT: v_mov_b32_e32 v6, s19
+; SI-NEXT: v_mov_b32_e32 v7, s20
+; SI-NEXT: v_mov_b32_e32 v8, s21
+; SI-NEXT: v_mov_b32_e32 v9, s22
+; SI-NEXT: v_mov_b32_e32 v10, s23
+; SI-NEXT: v_mov_b32_e32 v11, s24
+; SI-NEXT: v_mov_b32_e32 v12, s25
+; SI-NEXT: v_mov_b32_e32 v13, s26
+; SI-NEXT: v_mov_b32_e32 v14, s27
+; SI-NEXT: v_mov_b32_e32 v15, s28
+; SI-NEXT: v_mov_b32_e32 v16, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT: v_readfirstlane_b32 s56, v3
+; SI-NEXT: v_readfirstlane_b32 s57, v4
+; SI-NEXT: v_readfirstlane_b32 s58, v5
+; SI-NEXT: v_readfirstlane_b32 s59, v6
+; SI-NEXT: v_readfirstlane_b32 s60, v7
+; SI-NEXT: v_readfirstlane_b32 s61, v8
+; SI-NEXT: v_readfirstlane_b32 s62, v9
+; SI-NEXT: v_readfirstlane_b32 s63, v10
+; SI-NEXT: v_readfirstlane_b32 s72, v11
+; SI-NEXT: v_readfirstlane_b32 s73, v12
+; SI-NEXT: v_readfirstlane_b32 s74, v13
+; SI-NEXT: v_readfirstlane_b32 s75, v14
+; SI-NEXT: v_readfirstlane_b32 s76, v15
+; SI-NEXT: v_readfirstlane_b32 s77, v16
; SI-NEXT: v_readfirstlane_b32 s78, v0
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s79, v1
@@ -4798,107 +4880,107 @@ define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a
; SI-NEXT: s_lshl_b32 s7, s79, 16
; SI-NEXT: s_and_b32 s8, s78, 0xffff0000
; SI-NEXT: s_lshl_b32 s9, s78, 16
-; SI-NEXT: s_and_b32 s10, s29, 0xffff0000
-; SI-NEXT: s_lshl_b32 s11, s29, 16
-; SI-NEXT: s_and_b32 s12, s28, 0xffff0000
-; SI-NEXT: s_lshl_b32 s13, s28, 16
-; SI-NEXT: s_and_b32 s14, s27, 0xffff0000
-; SI-NEXT: s_lshl_b32 s15, s27, 16
-; SI-NEXT: s_and_b32 s40, s26, 0xffff0000
-; SI-NEXT: s_lshl_b32 s41, s26, 16
-; SI-NEXT: s_and_b32 s42, s25, 0xffff0000
-; SI-NEXT: s_lshl_b32 s43, s25, 16
-; SI-NEXT: s_and_b32 s44, s24, 0xffff0000
-; SI-NEXT: s_lshl_b32 s45, s24, 16
-; SI-NEXT: s_and_b32 s46, s23, 0xffff0000
-; SI-NEXT: s_lshl_b32 s47, s23, 16
-; SI-NEXT: s_and_b32 s56, s22, 0xffff0000
-; SI-NEXT: s_lshl_b32 s57, s22, 16
-; SI-NEXT: s_and_b32 s58, s21, 0xffff0000
-; SI-NEXT: s_lshl_b32 s59, s21, 16
-; SI-NEXT: s_and_b32 s60, s20, 0xffff0000
-; SI-NEXT: s_lshl_b32 s61, s20, 16
-; SI-NEXT: s_and_b32 s62, s19, 0xffff0000
-; SI-NEXT: s_lshl_b32 s63, s19, 16
-; SI-NEXT: s_and_b32 s72, s18, 0xffff0000
-; SI-NEXT: s_lshl_b32 s73, s18, 16
-; SI-NEXT: s_and_b32 s74, s17, 0xffff0000
-; SI-NEXT: s_lshl_b32 s75, s17, 16
-; SI-NEXT: s_and_b32 s76, s16, 0xffff0000
-; SI-NEXT: s_lshl_b32 s77, s16, 16
+; SI-NEXT: s_and_b32 s10, s77, 0xffff0000
+; SI-NEXT: s_lshl_b32 s11, s77, 16
+; SI-NEXT: s_and_b32 s12, s76, 0xffff0000
+; SI-NEXT: s_lshl_b32 s13, s76, 16
+; SI-NEXT: s_and_b32 s14, s75, 0xffff0000
+; SI-NEXT: s_lshl_b32 s15, s75, 16
+; SI-NEXT: s_and_b32 s16, s74, 0xffff0000
+; SI-NEXT: s_lshl_b32 s17, s74, 16
+; SI-NEXT: s_and_b32 s18, s73, 0xffff0000
+; SI-NEXT: s_lshl_b32 s19, s73, 16
+; SI-NEXT: s_and_b32 s20, s72, 0xffff0000
+; SI-NEXT: s_lshl_b32 s21, s72, 16
+; SI-NEXT: s_and_b32 s22, s63, 0xffff0000
+; SI-NEXT: s_lshl_b32 s23, s63, 16
+; SI-NEXT: s_and_b32 s24, s62, 0xffff0000
+; SI-NEXT: s_lshl_b32 s25, s62, 16
+; SI-NEXT: s_and_b32 s26, s61, 0xffff0000
+; SI-NEXT: s_lshl_b32 s27, s61, 16
+; SI-NEXT: s_and_b32 s28, s60, 0xffff0000
+; SI-NEXT: s_lshl_b32 s29, s60, 16
+; SI-NEXT: s_and_b32 s40, s59, 0xffff0000
+; SI-NEXT: s_lshl_b32 s41, s59, 16
+; SI-NEXT: s_and_b32 s42, s58, 0xffff0000
+; SI-NEXT: s_lshl_b32 s43, s58, 16
+; SI-NEXT: s_and_b32 s44, s57, 0xffff0000
+; SI-NEXT: s_lshl_b32 s45, s57, 16
+; SI-NEXT: s_and_b32 s46, s56, 0xffff0000
+; SI-NEXT: s_lshl_b32 s47, s56, 16
; SI-NEXT: s_cbranch_execnz .LBB21_3
; SI-NEXT: .LBB21_2: ; %cmp.true
-; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: s_add_i32 s17, s17, 3
-; SI-NEXT: s_add_i32 s18, s18, 3
-; SI-NEXT: s_add_i32 s19, s19, 3
-; SI-NEXT: s_add_i32 s20, s20, 3
-; SI-NEXT: s_add_i32 s21, s21, 3
-; SI-NEXT: s_add_i32 s22, s22, 3
-; SI-NEXT: s_add_i32 s23, s23, 3
-; SI-NEXT: s_add_i32 s24, s24, 3
-; SI-NEXT: s_add_i32 s25, s25, 3
-; SI-NEXT: s_add_i32 s26, s26, 3
-; SI-NEXT: s_add_i32 s27, s27, 3
-; SI-NEXT: s_add_i32 s28, s28, 3
-; SI-NEXT: s_add_i32 s29, s29, 3
+; SI-NEXT: s_add_i32 s56, s56, 3
+; SI-NEXT: s_add_i32 s57, s57, 3
+; SI-NEXT: s_add_i32 s58, s58, 3
+; SI-NEXT: s_add_i32 s59, s59, 3
+; SI-NEXT: s_add_i32 s60, s60, 3
+; SI-NEXT: s_add_i32 s61, s61, 3
+; SI-NEXT: s_add_i32 s62, s62, 3
+; SI-NEXT: s_add_i32 s63, s63, 3
+; SI-NEXT: s_add_i32 s72, s72, 3
+; SI-NEXT: s_add_i32 s73, s73, 3
+; SI-NEXT: s_add_i32 s74, s74, 3
+; SI-NEXT: s_add_i32 s75, s75, 3
+; SI-NEXT: s_add_i32 s76, s76, 3
+; SI-NEXT: s_add_i32 s77, s77, 3
; SI-NEXT: s_add_i32 s78, s78, 3
; SI-NEXT: s_add_i32 s79, s79, 3
; SI-NEXT: s_and_b32 s6, s79, 0xffff0000
; SI-NEXT: s_lshl_b32 s7, s79, 16
; SI-NEXT: s_and_b32 s8, s78, 0xffff0000
; SI-NEXT: s_lshl_b32 s9, s78, 16
-; SI-NEXT: s_and_b32 s10, s29, 0xffff0000
-; SI-NEXT: s_lshl_b32 s11, s29, 16
-; SI-NEXT: s_and_b32 s12, s28, 0xffff0000
-; SI-NEXT: s_lshl_b32 s13, s28, 16
-; SI-NEXT: s_and_b32 s14, s27, 0xffff0000
-; SI-NEXT: s_lshl_b32 s15, s27, 16
-; SI-NEXT: s_and_b32 s40, s26, 0xffff0000
-; SI-NEXT: s_lshl_b32 s41, s26, 16
-; SI-NEXT: s_and_b32 s42, s25, 0xffff0000
-; SI-NEXT: s_lshl_b32 s43, s25, 16
-; SI-NEXT: s_and_b32 s44, s24, 0xffff0000
-; SI-NEXT: s_lshl_b32 s45, s24, 16
-; SI-NEXT: s_and_b32 s46, s23, 0xffff0000
-; SI-NEXT: s_lshl_b32 s47, s23, 16
-; SI-NEXT: s_and_b32 s56, s22, 0xffff0000
-; SI-NEXT: s_lshl_b32 s57, s22, 16
-; SI-NEXT: s_and_b32 s58, s21, 0xffff0000
-; SI-NEXT: s_lshl_b32 s59, s21, 16
-; SI-NEXT: s_and_b32 s60, s20, 0xffff0000
-; SI-NEXT: s_lshl_b32 s61, s20, 16
-; SI-NEXT: s_and_b32 s62, s19, 0xffff0000
-; SI-NEXT: s_lshl_b32 s63, s19, 16
-; SI-NEXT: s_and_b32 s72, s18, 0xffff0000
-; SI-NEXT: s_lshl_b32 s73, s18, 16
-; SI-NEXT: s_and_b32 s74, s17, 0xffff0000
-; SI-NEXT: s_lshl_b32 s75, s17, 16
-; SI-NEXT: s_and_b32 s76, s16, 0xffff0000
-; SI-NEXT: s_lshl_b32 s77, s16, 16
+; SI-NEXT: s_and_b32 s10, s77, 0xffff0000
+; SI-NEXT: s_lshl_b32 s11, s77, 16
+; SI-NEXT: s_and_b32 s12, s76, 0xffff0000
+; SI-NEXT: s_lshl_b32 s13, s76, 16
+; SI-NEXT: s_and_b32 s14, s75, 0xffff0000
+; SI-NEXT: s_lshl_b32 s15, s75, 16
+; SI-NEXT: s_and_b32 s16, s74, 0xffff0000
+; SI-NEXT: s_lshl_b32 s17, s74, 16
+; SI-NEXT: s_and_b32 s18, s73, 0xffff0000
+; SI-NEXT: s_lshl_b32 s19, s73, 16
+; SI-NEXT: s_and_b32 s20, s72, 0xffff0000
+; SI-NEXT: s_lshl_b32 s21, s72, 16
+; SI-NEXT: s_and_b32 s22, s63, 0xffff0000
+; SI-NEXT: s_lshl_b32 s23, s63, 16
+; SI-NEXT: s_and_b32 s24, s62, 0xffff0000
+; SI-NEXT: s_lshl_b32 s25, s62, 16
+; SI-NEXT: s_and_b32 s26, s61, 0xffff0000
+; SI-NEXT: s_lshl_b32 s27, s61, 16
+; SI-NEXT: s_and_b32 s28, s60, 0xffff0000
+; SI-NEXT: s_lshl_b32 s29, s60, 16
+; SI-NEXT: s_and_b32 s40, s59, 0xffff0000
+; SI-NEXT: s_lshl_b32 s41, s59, 16
+; SI-NEXT: s_and_b32 s42, s58, 0xffff0000
+; SI-NEXT: s_lshl_b32 s43, s58, 16
+; SI-NEXT: s_and_b32 s44, s57, 0xffff0000
+; SI-NEXT: s_lshl_b32 s45, s57, 16
+; SI-NEXT: s_and_b32 s46, s56, 0xffff0000
+; SI-NEXT: s_lshl_b32 s47, s56, 16
; SI-NEXT: .LBB21_3: ; %end
-; SI-NEXT: v_mov_b32_e32 v0, s77
-; SI-NEXT: v_mov_b32_e32 v1, s76
-; SI-NEXT: v_mov_b32_e32 v2, s75
-; SI-NEXT: v_mov_b32_e32 v3, s74
-; SI-NEXT: v_mov_b32_e32 v4, s73
-; SI-NEXT: v_mov_b32_e32 v5, s72
-; SI-NEXT: v_mov_b32_e32 v6, s63
-; SI-NEXT: v_mov_b32_e32 v7, s62
-; SI-NEXT: v_mov_b32_e32 v8, s61
-; SI-NEXT: v_mov_b32_e32 v9, s60
-; SI-NEXT: v_mov_b32_e32 v10, s59
-; SI-NEXT: v_mov_b32_e32 v11, s58
-; SI-NEXT: v_mov_b32_e32 v12, s57
-; SI-NEXT: v_mov_b32_e32 v13, s56
-; SI-NEXT: v_mov_b32_e32 v14, s47
-; SI-NEXT: v_mov_b32_e32 v15, s46
-; SI-NEXT: v_mov_b32_e32 v16, s45
-; SI-NEXT: v_mov_b32_e32 v17, s44
-; SI-NEXT: v_mov_b32_e32 v18, s43
-; SI-NEXT: v_mov_b32_e32 v19, s42
-; SI-NEXT: v_mov_b32_e32 v20, s41
-; SI-NEXT: v_mov_b32_e32 v21, s40
+; SI-NEXT: v_mov_b32_e32 v0, s47
+; SI-NEXT: v_mov_b32_e32 v1, s46
+; SI-NEXT: v_mov_b32_e32 v2, s45
+; SI-NEXT: v_mov_b32_e32 v3, s44
+; SI-NEXT: v_mov_b32_e32 v4, s43
+; SI-NEXT: v_mov_b32_e32 v5, s42
+; SI-NEXT: v_mov_b32_e32 v6, s41
+; SI-NEXT: v_mov_b32_e32 v7, s40
+; SI-NEXT: v_mov_b32_e32 v8, s29
+; SI-NEXT: v_mov_b32_e32 v9, s28
+; SI-NEXT: v_mov_b32_e32 v10, s27
+; SI-NEXT: v_mov_b32_e32 v11, s26
+; SI-NEXT: v_mov_b32_e32 v12, s25
+; SI-NEXT: v_mov_b32_e32 v13, s24
+; SI-NEXT: v_mov_b32_e32 v14, s23
+; SI-NEXT: v_mov_b32_e32 v15, s22
+; SI-NEXT: v_mov_b32_e32 v16, s21
+; SI-NEXT: v_mov_b32_e32 v17, s20
+; SI-NEXT: v_mov_b32_e32 v18, s19
+; SI-NEXT: v_mov_b32_e32 v19, s18
+; SI-NEXT: v_mov_b32_e32 v20, s17
+; SI-NEXT: v_mov_b32_e32 v21, s16
; SI-NEXT: v_mov_b32_e32 v22, s15
; SI-NEXT: v_mov_b32_e32 v23, s14
; SI-NEXT: v_mov_b32_e32 v24, s13
@@ -4911,20 +4993,6 @@ define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a
; SI-NEXT: v_mov_b32_e32 v31, s6
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB21_4:
-; SI-NEXT: ; implicit-def: $sgpr77
-; SI-NEXT: ; implicit-def: $sgpr76
-; SI-NEXT: ; implicit-def: $sgpr75
-; SI-NEXT: ; implicit-def: $sgpr74
-; SI-NEXT: ; implicit-def: $sgpr73
-; SI-NEXT: ; implicit-def: $sgpr72
-; SI-NEXT: ; implicit-def: $sgpr63
-; SI-NEXT: ; implicit-def: $sgpr62
-; SI-NEXT: ; implicit-def: $sgpr61
-; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr59
-; SI-NEXT: ; implicit-def: $sgpr58
-; SI-NEXT: ; implicit-def: $sgpr57
-; SI-NEXT: ; implicit-def: $sgpr56
; SI-NEXT: ; implicit-def: $sgpr47
; SI-NEXT: ; implicit-def: $sgpr46
; SI-NEXT: ; implicit-def: $sgpr45
@@ -4933,6 +5001,20 @@ define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a
; SI-NEXT: ; implicit-def: $sgpr42
; SI-NEXT: ; implicit-def: $sgpr41
; SI-NEXT: ; implicit-def: $sgpr40
+; SI-NEXT: ; implicit-def: $sgpr29
+; SI-NEXT: ; implicit-def: $sgpr28
+; SI-NEXT: ; implicit-def: $sgpr27
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr25
+; SI-NEXT: ; implicit-def: $sgpr24
+; SI-NEXT: ; implicit-def: $sgpr23
+; SI-NEXT: ; implicit-def: $sgpr22
+; SI-NEXT: ; implicit-def: $sgpr21
+; SI-NEXT: ; implicit-def: $sgpr20
+; SI-NEXT: ; implicit-def: $sgpr19
+; SI-NEXT: ; implicit-def: $sgpr18
+; SI-NEXT: ; implicit-def: $sgpr17
+; SI-NEXT: ; implicit-def: $sgpr16
; SI-NEXT: ; implicit-def: $sgpr15
; SI-NEXT: ; implicit-def: $sgpr14
; SI-NEXT: ; implicit-def: $sgpr13
@@ -6495,172 +6577,209 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mul_f32_e64 v62, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v45, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v44, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v42, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17
+; SI-NEXT: v_mul_f32_e64 v41, 1.0, s21
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e64 v63, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29
; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9
; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13
; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15
; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v16
+; SI-NEXT: v_mul_f32_e64 v39, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v37, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s28
; SI-NEXT: s_cbranch_scc0 .LBB23_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17
-; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16
-; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16
-; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16
-; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16
-; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16
-; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16
-; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v62
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60
+; SI-NEXT: v_lshr_b64 v[0:1], v[39:40], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16
+; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v41
+; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63
+; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61
+; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v59
+; SI-NEXT: v_lshr_b64 v[5:6], v[37:38], 16
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58
+; SI-NEXT: v_lshr_b64 v[6:7], v[35:36], 16
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v57
+; SI-NEXT: v_lshr_b64 v[7:8], v[33:34], 16
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v56
+; SI-NEXT: v_lshr_b64 v[8:9], v[31:32], 16
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v47
+; SI-NEXT: v_lshr_b64 v[9:10], v[29:30], 16
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v46
+; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v45
+; SI-NEXT: v_lshr_b64 v[11:12], v[25:26], 16
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v44
+; SI-NEXT: v_lshr_b64 v[12:13], v[23:24], 16
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43
+; SI-NEXT: v_lshr_b64 v[13:14], v[21:22], 16
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42
+; SI-NEXT: v_lshr_b64 v[14:15], v[19:20], 16
+; SI-NEXT: v_mov_b32_e32 v20, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16
+; SI-NEXT: v_mov_b32_e32 v18, v20
; SI-NEXT: s_cbranch_execnz .LBB23_3
; SI-NEXT: .LBB23_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v63
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v56
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v47
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
+; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
+; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v43
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
+; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v42
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
+; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16
; SI-NEXT: .LBB23_3: ; %end
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB23_4:
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -6669,687 +6788,665 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-LABEL: bitcast_v32bf16_to_v16i32_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v19, s30, 0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: v_writelane_b32 v19, s31, 1
-; VI-NEXT: v_readfirstlane_b32 s30, v0
+; VI-NEXT: v_mov_b32_e32 v10, v2
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; VI-NEXT: v_mov_b32_e32 v15, v1
+; VI-NEXT: v_mov_b32_e32 v14, v0
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v1, s17
+; VI-NEXT: v_mov_b32_e32 v2, s18
+; VI-NEXT: v_mov_b32_e32 v3, s19
+; VI-NEXT: v_mov_b32_e32 v4, s20
+; VI-NEXT: v_mov_b32_e32 v5, s21
+; VI-NEXT: v_mov_b32_e32 v6, s22
+; VI-NEXT: v_mov_b32_e32 v7, s23
+; VI-NEXT: v_mov_b32_e32 v8, s24
+; VI-NEXT: v_mov_b32_e32 v9, s25
+; VI-NEXT: v_mov_b32_e32 v11, s27
+; VI-NEXT: v_mov_b32_e32 v13, s29
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_readfirstlane_b32 s31, v1
-; VI-NEXT: s_cbranch_scc0 .LBB23_3
+; VI-NEXT: v_mov_b32_e32 v10, s26
+; VI-NEXT: v_mov_b32_e32 v12, s28
+; VI-NEXT: s_cbranch_scc0 .LBB23_4
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB23_4
+; VI-NEXT: s_cbranch_execnz .LBB23_3
; VI-NEXT: .LBB23_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s31, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_lshl_b32 s4, s30, 16
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v0
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s4, v0
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17]
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_bfe_u32 v15, v14, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v14
+; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[17:18]
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_bfe_u32 v13, v12, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12
+; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[18:19]
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_bfe_u32 v11, v10, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v10
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[19:20]
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20]
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v8
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[20:21]
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21]
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_bfe_u32 v7, v6, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[21:22]
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; VI-NEXT: v_lshrrev_b64 v[21:22], 16, v[21:22]
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16
-; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v4
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[22:23]
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23]
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v2
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[23:24]
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v16, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v16, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1
+; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24]
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
-; VI-NEXT: v_add_f32_e32 v16, s4, v0
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16
-; VI-NEXT: v_add_f32_e32 v16, s4, v0
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_bfe_u32 v17, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v24, v3, v5, vcc
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16
-; VI-NEXT: s_branch .LBB23_5
-; VI-NEXT: .LBB23_3:
-; VI-NEXT: s_branch .LBB23_2
-; VI-NEXT: .LBB23_4:
-; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v1, s17
-; VI-NEXT: v_mov_b32_e32 v2, s18
-; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: v_mov_b32_e32 v4, s20
-; VI-NEXT: v_mov_b32_e32 v5, s21
-; VI-NEXT: v_mov_b32_e32 v6, s22
-; VI-NEXT: v_mov_b32_e32 v7, s23
-; VI-NEXT: v_mov_b32_e32 v8, s24
-; VI-NEXT: v_mov_b32_e32 v9, s25
-; VI-NEXT: v_mov_b32_e32 v10, s26
-; VI-NEXT: v_mov_b32_e32 v11, s27
-; VI-NEXT: v_mov_b32_e32 v12, s28
-; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: v_mov_b32_e32 v14, s30
-; VI-NEXT: v_mov_b32_e32 v15, s31
-; VI-NEXT: .LBB23_5: ; %end
-; VI-NEXT: v_readlane_b32 s31, v19, 1
-; VI-NEXT: v_readlane_b32 s30, v19, 0
-; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[24:25]
+; VI-NEXT: v_mov_b32_e32 v1, v23
+; VI-NEXT: v_mov_b32_e32 v3, v22
+; VI-NEXT: v_mov_b32_e32 v5, v21
+; VI-NEXT: v_mov_b32_e32 v7, v20
+; VI-NEXT: v_mov_b32_e32 v9, v19
+; VI-NEXT: v_mov_b32_e32 v11, v18
+; VI-NEXT: v_mov_b32_e32 v13, v17
+; VI-NEXT: v_mov_b32_e32 v15, v16
+; VI-NEXT: .LBB23_3: ; %end
; VI-NEXT: s_setpc_b64 s[30:31]
+; VI-NEXT: .LBB23_4:
+; VI-NEXT: s_branch .LBB23_2
;
; GFX9-LABEL: bitcast_v32bf16_to_v16i32_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v20, s30, 0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT: v_writelane_b32 v20, s31, 1
-; GFX9-NEXT: v_readfirstlane_b32 s30, v0
+; GFX9-NEXT: v_mov_b32_e32 v13, v2
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; GFX9-NEXT: v_mov_b32_e32 v15, v1
+; GFX9-NEXT: v_mov_b32_e32 v14, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s16
+; GFX9-NEXT: v_mov_b32_e32 v1, s17
+; GFX9-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-NEXT: v_mov_b32_e32 v3, s19
+; GFX9-NEXT: v_mov_b32_e32 v4, s20
+; GFX9-NEXT: v_mov_b32_e32 v5, s21
+; GFX9-NEXT: v_mov_b32_e32 v6, s22
+; GFX9-NEXT: v_mov_b32_e32 v7, s23
+; GFX9-NEXT: v_mov_b32_e32 v8, s24
+; GFX9-NEXT: v_mov_b32_e32 v9, s25
+; GFX9-NEXT: v_mov_b32_e32 v10, s26
+; GFX9-NEXT: v_mov_b32_e32 v11, s27
+; GFX9-NEXT: v_mov_b32_e32 v12, s28
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT: v_readfirstlane_b32 s31, v1
-; GFX9-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX9-NEXT: v_mov_b32_e32 v13, s29
+; GFX9-NEXT: s_cbranch_scc0 .LBB23_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB23_4
+; GFX9-NEXT: s_cbranch_execnz .LBB23_3
; GFX9-NEXT: .LBB23_2: ; %cmp.true
-; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s31, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add_f32_e32 v3, s4, v0
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
-; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: s_lshl_b32 s4, s30, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_add_f32_e32 v4, s4, v0
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v5, v5, v4
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v17, v17, v16
+; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v16, v16, v15
+; GFX9-NEXT: v_add_u32_e32 v16, 0x7fff, v16
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc
; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff
-; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX9-NEXT: v_and_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v15, v17, 16, v15
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v14, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v14
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v14
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v14, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v14, v17, 16, v14
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v13
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v13, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v13
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v13, v17, 16, v13
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v12
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v12
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v12
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v12, v17, 16, v12
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v11
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v11
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v11
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v11, v17, 16, v11
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v10
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v10, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v10
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v10
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v10, v17, 16, v10
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v9
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v9, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v9
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v9, v17, 16, v9
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v8
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v8, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v8
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v8, v17, 16, v8
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v7, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v7
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v7, v17, 16, v7
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v6
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v5, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v5
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v5
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v5, v17, 16, v5
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v4
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v4, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v4
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s29, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s28, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s27, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s26, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s25, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s24, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s23, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s22, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s21, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s20, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s19, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s18, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v17, v17, v2
-; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v4, v17, 16, v4
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v3, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v3
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v3
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v2
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v2
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v17, v17, v1
-; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s17, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc
-; GFX9-NEXT: v_add_f32_e32 v17, s4, v0
+; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17
-; GFX9-NEXT: v_add_f32_e32 v17, s4, v0
+; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v1
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
-; GFX9-NEXT: s_lshl_b32 s4, s16, 16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_add_f32_e32 v0, s4, v0
+; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1
; GFX9-NEXT: v_add_u32_e32 v18, v18, v0
; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16
-; GFX9-NEXT: s_branch .LBB23_5
-; GFX9-NEXT: .LBB23_3:
-; GFX9-NEXT: s_branch .LBB23_2
-; GFX9-NEXT: .LBB23_4:
-; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: v_mov_b32_e32 v1, s17
-; GFX9-NEXT: v_mov_b32_e32 v2, s18
-; GFX9-NEXT: v_mov_b32_e32 v3, s19
-; GFX9-NEXT: v_mov_b32_e32 v4, s20
-; GFX9-NEXT: v_mov_b32_e32 v5, s21
-; GFX9-NEXT: v_mov_b32_e32 v6, s22
-; GFX9-NEXT: v_mov_b32_e32 v7, s23
-; GFX9-NEXT: v_mov_b32_e32 v8, s24
-; GFX9-NEXT: v_mov_b32_e32 v9, s25
-; GFX9-NEXT: v_mov_b32_e32 v10, s26
-; GFX9-NEXT: v_mov_b32_e32 v11, s27
-; GFX9-NEXT: v_mov_b32_e32 v12, s28
-; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: v_mov_b32_e32 v14, s30
-; GFX9-NEXT: v_mov_b32_e32 v15, s31
-; GFX9-NEXT: .LBB23_5: ; %end
-; GFX9-NEXT: v_readlane_b32 s31, v20, 1
-; GFX9-NEXT: v_readlane_b32 s30, v20, 0
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v0, v17, 16, v0
+; GFX9-NEXT: .LBB23_3: ; %end
; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: .LBB23_4:
+; GFX9-NEXT: s_branch .LBB23_2
;
; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v16i32_scalar:
; GFX11-TRUE16: ; %bb.0:
@@ -8424,22 +8521,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v14
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v33
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -8475,6 +8556,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -8757,22 +8854,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18
@@ -8806,6 +8887,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -9077,25 +9174,9 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -9122,6 +9203,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -9704,345 +9801,373 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v4, s30, 0
-; SI-NEXT: v_writelane_b32 v4, s31, 1
-; SI-NEXT: v_writelane_b32 v4, s34, 2
-; SI-NEXT: v_writelane_b32 v4, s35, 3
-; SI-NEXT: v_writelane_b32 v4, s36, 4
-; SI-NEXT: v_writelane_b32 v4, s37, 5
-; SI-NEXT: v_writelane_b32 v4, s38, 6
-; SI-NEXT: v_writelane_b32 v4, s39, 7
-; SI-NEXT: v_writelane_b32 v4, s48, 8
-; SI-NEXT: v_writelane_b32 v4, s49, 9
-; SI-NEXT: v_writelane_b32 v4, s50, 10
-; SI-NEXT: v_writelane_b32 v4, s51, 11
-; SI-NEXT: v_writelane_b32 v4, s52, 12
-; SI-NEXT: v_writelane_b32 v4, s53, 13
-; SI-NEXT: v_writelane_b32 v4, s54, 14
-; SI-NEXT: v_writelane_b32 v4, s55, 15
-; SI-NEXT: v_writelane_b32 v4, s64, 16
-; SI-NEXT: v_writelane_b32 v4, s65, 17
-; SI-NEXT: v_writelane_b32 v4, s66, 18
-; SI-NEXT: v_writelane_b32 v4, s67, 19
-; SI-NEXT: v_writelane_b32 v4, s68, 20
-; SI-NEXT: v_writelane_b32 v4, s69, 21
-; SI-NEXT: v_writelane_b32 v4, s70, 22
-; SI-NEXT: v_writelane_b32 v4, s71, 23
-; SI-NEXT: v_writelane_b32 v4, s80, 24
-; SI-NEXT: v_writelane_b32 v4, s81, 25
-; SI-NEXT: v_writelane_b32 v4, s82, 26
-; SI-NEXT: v_writelane_b32 v4, s83, 27
+; SI-NEXT: v_writelane_b32 v18, s30, 0
+; SI-NEXT: v_writelane_b32 v18, s31, 1
+; SI-NEXT: v_writelane_b32 v18, s34, 2
+; SI-NEXT: v_writelane_b32 v18, s35, 3
+; SI-NEXT: v_writelane_b32 v18, s36, 4
+; SI-NEXT: v_writelane_b32 v18, s37, 5
+; SI-NEXT: v_writelane_b32 v18, s38, 6
+; SI-NEXT: v_writelane_b32 v18, s39, 7
+; SI-NEXT: v_writelane_b32 v18, s48, 8
+; SI-NEXT: v_writelane_b32 v18, s49, 9
+; SI-NEXT: v_writelane_b32 v18, s50, 10
+; SI-NEXT: v_writelane_b32 v18, s51, 11
+; SI-NEXT: v_writelane_b32 v18, s52, 12
+; SI-NEXT: v_writelane_b32 v18, s53, 13
+; SI-NEXT: v_writelane_b32 v18, s54, 14
+; SI-NEXT: v_writelane_b32 v18, s55, 15
+; SI-NEXT: v_writelane_b32 v18, s64, 16
+; SI-NEXT: v_writelane_b32 v18, s65, 17
+; SI-NEXT: v_writelane_b32 v18, s66, 18
+; SI-NEXT: v_writelane_b32 v18, s67, 19
+; SI-NEXT: v_writelane_b32 v18, s68, 20
+; SI-NEXT: v_writelane_b32 v18, s69, 21
+; SI-NEXT: v_writelane_b32 v18, s70, 22
+; SI-NEXT: v_writelane_b32 v18, s71, 23
+; SI-NEXT: v_writelane_b32 v18, s80, 24
+; SI-NEXT: v_writelane_b32 v18, s81, 25
+; SI-NEXT: v_writelane_b32 v18, s82, 26
+; SI-NEXT: v_writelane_b32 v18, s83, 27
+; SI-NEXT: v_mov_b32_e32 v4, s16
+; SI-NEXT: v_mov_b32_e32 v5, s17
+; SI-NEXT: v_mov_b32_e32 v6, s18
+; SI-NEXT: v_mov_b32_e32 v7, s19
+; SI-NEXT: v_mov_b32_e32 v8, s20
+; SI-NEXT: v_mov_b32_e32 v9, s21
+; SI-NEXT: v_mov_b32_e32 v10, s22
+; SI-NEXT: v_mov_b32_e32 v11, s23
+; SI-NEXT: v_mov_b32_e32 v12, s24
+; SI-NEXT: v_mov_b32_e32 v13, s25
+; SI-NEXT: v_mov_b32_e32 v14, s26
+; SI-NEXT: v_mov_b32_e32 v15, s27
+; SI-NEXT: v_mov_b32_e32 v16, s28
+; SI-NEXT: v_mov_b32_e32 v17, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT: v_writelane_b32 v4, s84, 28
+; SI-NEXT: v_writelane_b32 v18, s84, 28
+; SI-NEXT: v_readfirstlane_b32 s20, v4
+; SI-NEXT: v_readfirstlane_b32 s21, v5
+; SI-NEXT: v_readfirstlane_b32 s16, v6
+; SI-NEXT: v_readfirstlane_b32 s17, v7
+; SI-NEXT: v_readfirstlane_b32 s14, v8
+; SI-NEXT: v_readfirstlane_b32 s15, v9
+; SI-NEXT: v_readfirstlane_b32 s12, v10
+; SI-NEXT: v_readfirstlane_b32 s13, v11
+; SI-NEXT: v_readfirstlane_b32 s10, v12
+; SI-NEXT: v_readfirstlane_b32 s11, v13
+; SI-NEXT: v_readfirstlane_b32 s8, v14
+; SI-NEXT: v_readfirstlane_b32 s9, v15
+; SI-NEXT: v_readfirstlane_b32 s6, v16
+; SI-NEXT: v_readfirstlane_b32 s7, v17
; SI-NEXT: v_readfirstlane_b32 s4, v1
-; SI-NEXT: s_and_b64 s[6:7], vcc, exec
+; SI-NEXT: s_and_b64 s[18:19], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v2
-; SI-NEXT: v_writelane_b32 v4, s85, 29
+; SI-NEXT: v_writelane_b32 v18, s85, 29
; SI-NEXT: s_cbranch_scc0 .LBB25_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s38, s5, 24
; SI-NEXT: s_lshr_b32 s39, s5, 16
; SI-NEXT: s_lshr_b32 s48, s5, 8
-; SI-NEXT: s_lshr_b32 s49, s29, 24
-; SI-NEXT: s_lshr_b32 s50, s29, 16
-; SI-NEXT: s_lshr_b32 s51, s29, 8
-; SI-NEXT: s_lshr_b32 s52, s27, 24
-; SI-NEXT: s_lshr_b32 s53, s27, 16
-; SI-NEXT: s_lshr_b32 s54, s27, 8
-; SI-NEXT: s_lshr_b32 s55, s25, 24
-; SI-NEXT: s_lshr_b32 s64, s25, 16
-; SI-NEXT: s_lshr_b32 s65, s25, 8
-; SI-NEXT: s_lshr_b32 s66, s23, 24
-; SI-NEXT: s_lshr_b32 s67, s23, 16
-; SI-NEXT: s_lshr_b32 s68, s23, 8
-; SI-NEXT: s_lshr_b32 s69, s21, 24
-; SI-NEXT: s_lshr_b32 s70, s21, 16
-; SI-NEXT: s_lshr_b32 s71, s21, 8
-; SI-NEXT: s_lshr_b32 s80, s19, 24
-; SI-NEXT: s_lshr_b32 s81, s19, 16
-; SI-NEXT: s_lshr_b32 s82, s19, 8
-; SI-NEXT: s_lshr_b32 s83, s17, 24
-; SI-NEXT: s_lshr_b32 s84, s17, 16
-; SI-NEXT: s_lshr_b32 s85, s17, 8
-; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8
-; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 24
-; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16
-; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 8
-; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 24
-; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 16
-; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8
-; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 24
-; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 16
-; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 8
-; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24
-; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16
-; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 8
-; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 24
-; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16
-; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8
-; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24
-; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 16
-; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 8
-; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 24
-; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 16
-; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 8
+; SI-NEXT: s_lshr_b32 s49, s7, 24
+; SI-NEXT: s_lshr_b32 s50, s7, 16
+; SI-NEXT: s_lshr_b32 s51, s7, 8
+; SI-NEXT: s_lshr_b32 s52, s9, 24
+; SI-NEXT: s_lshr_b32 s53, s9, 16
+; SI-NEXT: s_lshr_b32 s54, s9, 8
+; SI-NEXT: s_lshr_b32 s55, s11, 24
+; SI-NEXT: s_lshr_b32 s64, s11, 16
+; SI-NEXT: s_lshr_b32 s65, s11, 8
+; SI-NEXT: s_lshr_b32 s66, s13, 24
+; SI-NEXT: s_lshr_b32 s67, s13, 16
+; SI-NEXT: s_lshr_b32 s68, s13, 8
+; SI-NEXT: s_lshr_b32 s69, s15, 24
+; SI-NEXT: s_lshr_b32 s70, s15, 16
+; SI-NEXT: s_lshr_b32 s71, s15, 8
+; SI-NEXT: s_lshr_b32 s80, s17, 24
+; SI-NEXT: s_lshr_b32 s81, s17, 16
+; SI-NEXT: s_lshr_b32 s82, s17, 8
+; SI-NEXT: s_lshr_b32 s83, s21, 24
+; SI-NEXT: s_lshr_b32 s84, s21, 16
+; SI-NEXT: s_lshr_b32 s85, s21, 8
+; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 24
+; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16
+; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8
+; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24
+; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16
+; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8
+; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 24
+; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16
+; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8
+; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24
+; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16
+; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8
+; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24
+; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16
+; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 8
+; SI-NEXT: s_lshr_b64 s[74:75], s[14:15], 24
+; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16
+; SI-NEXT: s_lshr_b64 s[78:79], s[14:15], 8
+; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24
+; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16
+; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8
+; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 24
+; SI-NEXT: s_lshr_b64 s[34:35], s[20:21], 16
+; SI-NEXT: s_lshr_b64 s[36:37], s[20:21], 8
; SI-NEXT: s_cbranch_execnz .LBB25_3
; SI-NEXT: .LBB25_2: ; %cmp.true
-; SI-NEXT: s_add_i32 s17, s17, 3
-; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: s_add_i32 s19, s19, 3
-; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_add_i32 s21, s21, 3
; SI-NEXT: s_add_i32 s20, s20, 3
-; SI-NEXT: s_add_i32 s23, s23, 3
-; SI-NEXT: s_add_i32 s22, s22, 3
-; SI-NEXT: s_add_i32 s25, s25, 3
-; SI-NEXT: s_add_i32 s24, s24, 3
-; SI-NEXT: s_add_i32 s27, s27, 3
-; SI-NEXT: s_add_i32 s26, s26, 3
-; SI-NEXT: s_add_i32 s29, s29, 3
-; SI-NEXT: s_add_i32 s28, s28, 3
+; SI-NEXT: s_add_i32 s17, s17, 3
+; SI-NEXT: s_add_i32 s16, s16, 3
+; SI-NEXT: s_add_i32 s15, s15, 3
+; SI-NEXT: s_add_i32 s14, s14, 3
+; SI-NEXT: s_add_i32 s13, s13, 3
+; SI-NEXT: s_add_i32 s12, s12, 3
+; SI-NEXT: s_add_i32 s11, s11, 3
+; SI-NEXT: s_add_i32 s10, s10, 3
+; SI-NEXT: s_add_i32 s9, s9, 3
+; SI-NEXT: s_add_i32 s8, s8, 3
+; SI-NEXT: s_add_i32 s7, s7, 3
+; SI-NEXT: s_add_i32 s6, s6, 3
; SI-NEXT: s_add_i32 s5, s5, 3
; SI-NEXT: s_add_i32 s4, s4, 3
-; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8
-; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 24
-; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16
-; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 8
-; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 24
-; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 16
-; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8
-; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 24
-; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 16
-; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 8
-; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24
-; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16
+; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 24
+; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16
+; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8
+; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24
+; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16
+; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8
+; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 24
+; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16
+; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8
+; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24
+; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16
+; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8
+; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24
+; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16
; SI-NEXT: s_lshr_b32 s38, s5, 24
; SI-NEXT: s_lshr_b32 s39, s5, 16
; SI-NEXT: s_lshr_b32 s48, s5, 8
-; SI-NEXT: s_lshr_b32 s49, s29, 24
-; SI-NEXT: s_lshr_b32 s50, s29, 16
-; SI-NEXT: s_lshr_b32 s51, s29, 8
-; SI-NEXT: s_lshr_b32 s52, s27, 24
-; SI-NEXT: s_lshr_b32 s53, s27, 16
-; SI-NEXT: s_lshr_b32 s54, s27, 8
-; SI-NEXT: s_lshr_b32 s55, s25, 24
-; SI-NEXT: s_lshr_b32 s64, s25, 16
-; SI-NEXT: s_lshr_b32 s65, s25, 8
-; SI-NEXT: s_lshr_b32 s66, s23, 24
-; SI-NEXT: s_lshr_b32 s67, s23, 16
-; SI-NEXT: s_lshr_b32 s68, s23, 8
-; SI-NEXT: s_lshr_b32 s69, s21, 24
-; SI-NEXT: s_lshr_b32 s70, s21, 16
-; SI-NEXT: s_lshr_b32 s71, s21, 8
-; SI-NEXT: s_lshr_b32 s80, s19, 24
-; SI-NEXT: s_lshr_b32 s81, s19, 16
-; SI-NEXT: s_lshr_b32 s82, s19, 8
-; SI-NEXT: s_lshr_b32 s83, s17, 24
-; SI-NEXT: s_lshr_b32 s84, s17, 16
-; SI-NEXT: s_lshr_b32 s85, s17, 8
-; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 8
-; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 24
-; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16
-; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8
-; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24
-; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 16
-; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 8
-; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 24
-; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 16
-; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 8
+; SI-NEXT: s_lshr_b32 s49, s7, 24
+; SI-NEXT: s_lshr_b32 s50, s7, 16
+; SI-NEXT: s_lshr_b32 s51, s7, 8
+; SI-NEXT: s_lshr_b32 s52, s9, 24
+; SI-NEXT: s_lshr_b32 s53, s9, 16
+; SI-NEXT: s_lshr_b32 s54, s9, 8
+; SI-NEXT: s_lshr_b32 s55, s11, 24
+; SI-NEXT: s_lshr_b32 s64, s11, 16
+; SI-NEXT: s_lshr_b32 s65, s11, 8
+; SI-NEXT: s_lshr_b32 s66, s13, 24
+; SI-NEXT: s_lshr_b32 s67, s13, 16
+; SI-NEXT: s_lshr_b32 s68, s13, 8
+; SI-NEXT: s_lshr_b32 s69, s15, 24
+; SI-NEXT: s_lshr_b32 s70, s15, 16
+; SI-NEXT: s_lshr_b32 s71, s15, 8
+; SI-NEXT: s_lshr_b32 s80, s17, 24
+; SI-NEXT: s_lshr_b32 s81, s17, 16
+; SI-NEXT: s_lshr_b32 s82, s17, 8
+; SI-NEXT: s_lshr_b32 s83, s21, 24
+; SI-NEXT: s_lshr_b32 s84, s21, 16
+; SI-NEXT: s_lshr_b32 s85, s21, 8
+; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 8
+; SI-NEXT: s_lshr_b64 s[74:75], s[14:15], 24
+; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16
+; SI-NEXT: s_lshr_b64 s[78:79], s[14:15], 8
+; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24
+; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16
+; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8
+; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 24
+; SI-NEXT: s_lshr_b64 s[34:35], s[20:21], 16
+; SI-NEXT: s_lshr_b64 s[36:37], s[20:21], 8
; SI-NEXT: .LBB25_3: ; %end
-; SI-NEXT: s_lshl_b32 s7, s36, 8
-; SI-NEXT: s_and_b32 s9, s16, 0xff
-; SI-NEXT: s_or_b32 s7, s9, s7
-; SI-NEXT: s_and_b32 s9, s34, 0xff
-; SI-NEXT: s_lshl_b32 s11, s30, 24
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_or_b32 s9, s11, s9
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: v_mov_b32_e32 v1, s7
-; SI-NEXT: s_and_b32 s7, s17, 0xff
-; SI-NEXT: s_lshl_b32 s9, s85, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s84, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s83, 24
-; SI-NEXT: s_or_b32 s9, s11, s9
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_lshl_b32 s7, s94, 8
-; SI-NEXT: s_and_b32 s9, s18, 0xff
-; SI-NEXT: s_or_b32 s7, s9, s7
-; SI-NEXT: s_and_b32 s9, s92, 0xff
-; SI-NEXT: s_lshl_b32 s11, s90, 24
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_or_b32 s9, s11, s9
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: v_mov_b32_e32 v3, s7
-; SI-NEXT: s_and_b32 s7, s19, 0xff
-; SI-NEXT: s_lshl_b32 s9, s82, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s81, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s80, 24
-; SI-NEXT: s_or_b32 s9, s11, s9
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: s_lshl_b32 s19, s36, 8
+; SI-NEXT: s_and_b32 s20, s20, 0xff
+; SI-NEXT: s_or_b32 s19, s20, s19
+; SI-NEXT: s_and_b32 s20, s34, 0xff
+; SI-NEXT: s_lshl_b32 s23, s30, 24
+; SI-NEXT: s_lshl_b32 s20, s20, 16
+; SI-NEXT: s_or_b32 s20, s23, s20
+; SI-NEXT: s_and_b32 s19, s19, 0xffff
+; SI-NEXT: s_or_b32 s19, s19, s20
+; SI-NEXT: v_mov_b32_e32 v1, s19
+; SI-NEXT: s_and_b32 s19, s21, 0xff
+; SI-NEXT: s_lshl_b32 s20, s85, 8
+; SI-NEXT: s_or_b32 s19, s19, s20
+; SI-NEXT: s_and_b32 s20, s84, 0xff
+; SI-NEXT: s_lshl_b32 s20, s20, 16
+; SI-NEXT: s_lshl_b32 s21, s83, 24
+; SI-NEXT: s_or_b32 s20, s21, s20
+; SI-NEXT: s_and_b32 s19, s19, 0xffff
+; SI-NEXT: s_or_b32 s19, s19, s20
+; SI-NEXT: v_mov_b32_e32 v2, s19
+; SI-NEXT: s_lshl_b32 s19, s94, 8
+; SI-NEXT: s_and_b32 s16, s16, 0xff
+; SI-NEXT: s_or_b32 s16, s16, s19
+; SI-NEXT: s_and_b32 s19, s92, 0xff
+; SI-NEXT: s_lshl_b32 s20, s90, 24
+; SI-NEXT: s_lshl_b32 s19, s19, 16
+; SI-NEXT: s_or_b32 s19, s20, s19
+; SI-NEXT: s_and_b32 s16, s16, 0xffff
+; SI-NEXT: s_or_b32 s16, s16, s19
+; SI-NEXT: v_mov_b32_e32 v3, s16
+; SI-NEXT: s_and_b32 s16, s17, 0xff
+; SI-NEXT: s_lshl_b32 s17, s82, 8
+; SI-NEXT: s_or_b32 s16, s16, s17
+; SI-NEXT: s_and_b32 s17, s81, 0xff
+; SI-NEXT: s_lshl_b32 s17, s17, 16
+; SI-NEXT: s_lshl_b32 s19, s80, 24
+; SI-NEXT: s_or_b32 s17, s19, s17
+; SI-NEXT: s_and_b32 s16, s16, 0xffff
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s20, 0xff
-; SI-NEXT: s_lshl_b32 s9, s78, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s76, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s74, 24
+; SI-NEXT: v_mov_b32_e32 v2, s16
+; SI-NEXT: s_and_b32 s14, s14, 0xff
+; SI-NEXT: s_lshl_b32 s16, s78, 8
+; SI-NEXT: s_or_b32 s14, s14, s16
+; SI-NEXT: s_and_b32 s16, s76, 0xff
+; SI-NEXT: s_lshl_b32 s16, s16, 16
+; SI-NEXT: s_lshl_b32 s17, s74, 24
; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: s_and_b32 s14, s14, 0xffff
+; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s14, s14, s16
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s21, 0xff
-; SI-NEXT: s_lshl_b32 s9, s71, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s70, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s69, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: v_mov_b32_e32 v2, s14
+; SI-NEXT: s_and_b32 s14, s15, 0xff
+; SI-NEXT: s_lshl_b32 s15, s71, 8
+; SI-NEXT: s_or_b32 s14, s14, s15
+; SI-NEXT: s_and_b32 s15, s70, 0xff
+; SI-NEXT: s_lshl_b32 s15, s15, 16
+; SI-NEXT: s_lshl_b32 s16, s69, 24
+; SI-NEXT: s_and_b32 s14, s14, 0xffff
+; SI-NEXT: s_or_b32 s15, s16, s15
; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s22, 0xff
-; SI-NEXT: s_lshl_b32 s9, s88, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s72, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s62, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: v_mov_b32_e32 v2, s14
+; SI-NEXT: s_and_b32 s12, s12, 0xff
+; SI-NEXT: s_lshl_b32 s14, s88, 8
+; SI-NEXT: s_or_b32 s12, s12, s14
+; SI-NEXT: s_and_b32 s14, s72, 0xff
+; SI-NEXT: s_lshl_b32 s14, s14, 16
+; SI-NEXT: s_lshl_b32 s15, s62, 24
+; SI-NEXT: s_and_b32 s12, s12, 0xffff
+; SI-NEXT: s_or_b32 s14, s15, s14
; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s12, s12, s14
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s23, 0xff
-; SI-NEXT: s_lshl_b32 s9, s68, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s67, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s66, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: v_mov_b32_e32 v2, s12
+; SI-NEXT: s_and_b32 s12, s13, 0xff
+; SI-NEXT: s_lshl_b32 s13, s68, 8
+; SI-NEXT: s_or_b32 s12, s12, s13
+; SI-NEXT: s_and_b32 s13, s67, 0xff
+; SI-NEXT: s_lshl_b32 s13, s13, 16
+; SI-NEXT: s_lshl_b32 s14, s66, 24
+; SI-NEXT: s_and_b32 s12, s12, 0xffff
+; SI-NEXT: s_or_b32 s13, s14, s13
; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s12, s12, s13
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s24, 0xff
-; SI-NEXT: s_lshl_b32 s9, s60, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s58, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s56, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: v_mov_b32_e32 v2, s12
+; SI-NEXT: s_and_b32 s10, s10, 0xff
+; SI-NEXT: s_lshl_b32 s12, s60, 8
+; SI-NEXT: s_or_b32 s10, s10, s12
+; SI-NEXT: s_and_b32 s12, s58, 0xff
+; SI-NEXT: s_lshl_b32 s12, s12, 16
+; SI-NEXT: s_lshl_b32 s13, s56, 24
+; SI-NEXT: s_and_b32 s10, s10, 0xffff
+; SI-NEXT: s_or_b32 s12, s13, s12
; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s10, s10, s12
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s25, 0xff
-; SI-NEXT: s_lshl_b32 s9, s65, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s64, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s55, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: s_and_b32 s10, s11, 0xff
+; SI-NEXT: s_lshl_b32 s11, s65, 8
+; SI-NEXT: s_or_b32 s10, s10, s11
+; SI-NEXT: s_and_b32 s11, s64, 0xff
+; SI-NEXT: s_lshl_b32 s11, s11, 16
+; SI-NEXT: s_lshl_b32 s12, s55, 24
+; SI-NEXT: s_and_b32 s10, s10, 0xffff
+; SI-NEXT: s_or_b32 s11, s12, s11
; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s10, s10, s11
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s26, 0xff
-; SI-NEXT: s_lshl_b32 s9, s46, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s44, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: s_lshl_b32 s10, s46, 8
+; SI-NEXT: s_or_b32 s8, s8, s10
+; SI-NEXT: s_and_b32 s10, s44, 0xff
+; SI-NEXT: s_lshl_b32 s10, s10, 16
; SI-NEXT: s_lshl_b32 s11, s42, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: s_and_b32 s8, s8, 0xffff
+; SI-NEXT: s_or_b32 s10, s11, s10
; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s8, s8, s10
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s27, 0xff
+; SI-NEXT: v_mov_b32_e32 v2, s8
+; SI-NEXT: s_and_b32 s8, s9, 0xff
; SI-NEXT: s_lshl_b32 s9, s54, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s8, s8, s9
; SI-NEXT: s_and_b32 s9, s53, 0xff
; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s52, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: s_lshl_b32 s10, s52, 24
+; SI-NEXT: s_and_b32 s8, s8, 0xffff
+; SI-NEXT: s_or_b32 s9, s10, s9
; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s8, s8, s9
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s28, 0xff
-; SI-NEXT: s_lshl_b32 s9, s40, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s14, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s12, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: v_mov_b32_e32 v2, s8
+; SI-NEXT: s_and_b32 s6, s6, 0xff
+; SI-NEXT: s_lshl_b32 s8, s40, 8
+; SI-NEXT: s_or_b32 s6, s6, s8
+; SI-NEXT: s_and_b32 s8, s28, 0xff
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s26, 24
+; SI-NEXT: s_and_b32 s6, s6, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s6, s6, s8
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s29, 0xff
-; SI-NEXT: s_lshl_b32 s9, s51, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s50, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s49, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: s_and_b32 s6, s7, 0xff
+; SI-NEXT: s_lshl_b32 s7, s51, 8
+; SI-NEXT: s_or_b32 s6, s6, s7
+; SI-NEXT: s_and_b32 s7, s50, 0xff
+; SI-NEXT: s_lshl_b32 s7, s7, 16
+; SI-NEXT: s_lshl_b32 s8, s49, 24
+; SI-NEXT: s_and_b32 s6, s6, 0xffff
+; SI-NEXT: s_or_b32 s7, s8, s7
; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s6, s6, s7
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
+; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: s_and_b32 s4, s4, 0xff
-; SI-NEXT: s_lshl_b32 s7, s10, 8
-; SI-NEXT: s_or_b32 s4, s4, s7
-; SI-NEXT: s_and_b32 s7, s8, 0xff
-; SI-NEXT: s_lshl_b32 s7, s7, 16
-; SI-NEXT: s_lshl_b32 s6, s6, 24
+; SI-NEXT: s_lshl_b32 s6, s24, 8
+; SI-NEXT: s_or_b32 s4, s4, s6
+; SI-NEXT: s_and_b32 s6, s22, 0xff
+; SI-NEXT: s_lshl_b32 s6, s6, 16
+; SI-NEXT: s_lshl_b32 s7, s18, 24
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: s_or_b32 s6, s6, s7
+; SI-NEXT: s_or_b32 s6, s7, s6
; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0
; SI-NEXT: s_or_b32 s4, s4, s6
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
@@ -10062,38 +10187,38 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s85, v4, 29
-; SI-NEXT: v_readlane_b32 s84, v4, 28
-; SI-NEXT: v_readlane_b32 s83, v4, 27
-; SI-NEXT: v_readlane_b32 s82, v4, 26
-; SI-NEXT: v_readlane_b32 s81, v4, 25
-; SI-NEXT: v_readlane_b32 s80, v4, 24
-; SI-NEXT: v_readlane_b32 s71, v4, 23
-; SI-NEXT: v_readlane_b32 s70, v4, 22
-; SI-NEXT: v_readlane_b32 s69, v4, 21
-; SI-NEXT: v_readlane_b32 s68, v4, 20
-; SI-NEXT: v_readlane_b32 s67, v4, 19
-; SI-NEXT: v_readlane_b32 s66, v4, 18
-; SI-NEXT: v_readlane_b32 s65, v4, 17
-; SI-NEXT: v_readlane_b32 s64, v4, 16
-; SI-NEXT: v_readlane_b32 s55, v4, 15
-; SI-NEXT: v_readlane_b32 s54, v4, 14
-; SI-NEXT: v_readlane_b32 s53, v4, 13
-; SI-NEXT: v_readlane_b32 s52, v4, 12
-; SI-NEXT: v_readlane_b32 s51, v4, 11
-; SI-NEXT: v_readlane_b32 s50, v4, 10
-; SI-NEXT: v_readlane_b32 s49, v4, 9
-; SI-NEXT: v_readlane_b32 s48, v4, 8
-; SI-NEXT: v_readlane_b32 s39, v4, 7
-; SI-NEXT: v_readlane_b32 s38, v4, 6
-; SI-NEXT: v_readlane_b32 s37, v4, 5
-; SI-NEXT: v_readlane_b32 s36, v4, 4
-; SI-NEXT: v_readlane_b32 s35, v4, 3
-; SI-NEXT: v_readlane_b32 s34, v4, 2
-; SI-NEXT: v_readlane_b32 s31, v4, 1
-; SI-NEXT: v_readlane_b32 s30, v4, 0
+; SI-NEXT: v_readlane_b32 s85, v18, 29
+; SI-NEXT: v_readlane_b32 s84, v18, 28
+; SI-NEXT: v_readlane_b32 s83, v18, 27
+; SI-NEXT: v_readlane_b32 s82, v18, 26
+; SI-NEXT: v_readlane_b32 s81, v18, 25
+; SI-NEXT: v_readlane_b32 s80, v18, 24
+; SI-NEXT: v_readlane_b32 s71, v18, 23
+; SI-NEXT: v_readlane_b32 s70, v18, 22
+; SI-NEXT: v_readlane_b32 s69, v18, 21
+; SI-NEXT: v_readlane_b32 s68, v18, 20
+; SI-NEXT: v_readlane_b32 s67, v18, 19
+; SI-NEXT: v_readlane_b32 s66, v18, 18
+; SI-NEXT: v_readlane_b32 s65, v18, 17
+; SI-NEXT: v_readlane_b32 s64, v18, 16
+; SI-NEXT: v_readlane_b32 s55, v18, 15
+; SI-NEXT: v_readlane_b32 s54, v18, 14
+; SI-NEXT: v_readlane_b32 s53, v18, 13
+; SI-NEXT: v_readlane_b32 s52, v18, 12
+; SI-NEXT: v_readlane_b32 s51, v18, 11
+; SI-NEXT: v_readlane_b32 s50, v18, 10
+; SI-NEXT: v_readlane_b32 s49, v18, 9
+; SI-NEXT: v_readlane_b32 s48, v18, 8
+; SI-NEXT: v_readlane_b32 s39, v18, 7
+; SI-NEXT: v_readlane_b32 s38, v18, 6
+; SI-NEXT: v_readlane_b32 s37, v18, 5
+; SI-NEXT: v_readlane_b32 s36, v18, 4
+; SI-NEXT: v_readlane_b32 s35, v18, 3
+; SI-NEXT: v_readlane_b32 s34, v18, 2
+; SI-NEXT: v_readlane_b32 s31, v18, 1
+; SI-NEXT: v_readlane_b32 s30, v18, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -10141,43 +10266,71 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; SI-NEXT: ; implicit-def: $sgpr44
; SI-NEXT: ; implicit-def: $sgpr42
; SI-NEXT: ; implicit-def: $sgpr40
-; SI-NEXT: ; implicit-def: $sgpr14
-; SI-NEXT: ; implicit-def: $sgpr12
-; SI-NEXT: ; implicit-def: $sgpr10
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr28
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr24
+; SI-NEXT: ; implicit-def: $sgpr22
+; SI-NEXT: ; implicit-def: $sgpr18
; SI-NEXT: s_branch .LBB25_2
;
; VI-LABEL: bitcast_v16i32_to_v64i8_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v4, s30, 0
-; VI-NEXT: v_writelane_b32 v4, s31, 1
-; VI-NEXT: v_writelane_b32 v4, s34, 2
-; VI-NEXT: v_writelane_b32 v4, s35, 3
-; VI-NEXT: v_writelane_b32 v4, s36, 4
-; VI-NEXT: v_writelane_b32 v4, s37, 5
-; VI-NEXT: v_writelane_b32 v4, s38, 6
-; VI-NEXT: v_writelane_b32 v4, s39, 7
-; VI-NEXT: v_writelane_b32 v4, s48, 8
-; VI-NEXT: v_writelane_b32 v4, s49, 9
-; VI-NEXT: v_writelane_b32 v4, s50, 10
-; VI-NEXT: v_writelane_b32 v4, s51, 11
-; VI-NEXT: v_writelane_b32 v4, s52, 12
-; VI-NEXT: v_writelane_b32 v4, s53, 13
-; VI-NEXT: v_writelane_b32 v4, s54, 14
-; VI-NEXT: v_writelane_b32 v4, s55, 15
-; VI-NEXT: v_writelane_b32 v4, s64, 16
-; VI-NEXT: v_writelane_b32 v4, s65, 17
+; VI-NEXT: v_writelane_b32 v18, s30, 0
+; VI-NEXT: v_writelane_b32 v18, s31, 1
+; VI-NEXT: v_writelane_b32 v18, s34, 2
+; VI-NEXT: v_writelane_b32 v18, s35, 3
+; VI-NEXT: v_writelane_b32 v18, s36, 4
+; VI-NEXT: v_writelane_b32 v18, s37, 5
+; VI-NEXT: v_writelane_b32 v18, s38, 6
+; VI-NEXT: v_writelane_b32 v18, s39, 7
+; VI-NEXT: v_writelane_b32 v18, s48, 8
+; VI-NEXT: v_writelane_b32 v18, s49, 9
+; VI-NEXT: v_writelane_b32 v18, s50, 10
+; VI-NEXT: v_writelane_b32 v18, s51, 11
+; VI-NEXT: v_writelane_b32 v18, s52, 12
+; VI-NEXT: v_writelane_b32 v18, s53, 13
+; VI-NEXT: v_writelane_b32 v18, s54, 14
+; VI-NEXT: v_writelane_b32 v18, s55, 15
+; VI-NEXT: v_writelane_b32 v18, s64, 16
+; VI-NEXT: v_writelane_b32 v18, s65, 17
+; VI-NEXT: v_mov_b32_e32 v4, s16
+; VI-NEXT: v_mov_b32_e32 v5, s17
+; VI-NEXT: v_mov_b32_e32 v6, s18
+; VI-NEXT: v_mov_b32_e32 v7, s19
+; VI-NEXT: v_mov_b32_e32 v8, s20
+; VI-NEXT: v_mov_b32_e32 v9, s21
+; VI-NEXT: v_mov_b32_e32 v10, s22
+; VI-NEXT: v_mov_b32_e32 v11, s23
+; VI-NEXT: v_mov_b32_e32 v12, s24
+; VI-NEXT: v_mov_b32_e32 v13, s25
+; VI-NEXT: v_mov_b32_e32 v14, s26
+; VI-NEXT: v_mov_b32_e32 v15, s27
+; VI-NEXT: v_mov_b32_e32 v16, s28
+; VI-NEXT: v_mov_b32_e32 v17, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; VI-NEXT: v_writelane_b32 v4, s66, 18
+; VI-NEXT: v_writelane_b32 v18, s66, 18
+; VI-NEXT: v_readfirstlane_b32 s18, v4
+; VI-NEXT: v_readfirstlane_b32 s19, v5
+; VI-NEXT: v_readfirstlane_b32 s16, v6
+; VI-NEXT: v_readfirstlane_b32 s17, v7
+; VI-NEXT: v_readfirstlane_b32 s14, v8
+; VI-NEXT: v_readfirstlane_b32 s15, v9
+; VI-NEXT: v_readfirstlane_b32 s12, v10
+; VI-NEXT: v_readfirstlane_b32 s13, v11
+; VI-NEXT: v_readfirstlane_b32 s10, v12
+; VI-NEXT: v_readfirstlane_b32 s11, v13
+; VI-NEXT: v_readfirstlane_b32 s8, v14
+; VI-NEXT: v_readfirstlane_b32 s9, v15
+; VI-NEXT: v_readfirstlane_b32 s6, v16
+; VI-NEXT: v_readfirstlane_b32 s7, v17
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_and_b64 s[20:21], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s5, v2
-; VI-NEXT: v_writelane_b32 v4, s67, 19
+; VI-NEXT: v_writelane_b32 v18, s67, 19
; VI-NEXT: s_cbranch_scc0 .LBB25_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s56, s5, 24
@@ -10185,287 +10338,287 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; VI-NEXT: s_lshr_b32 s58, s5, 8
; VI-NEXT: s_lshr_b32 s59, s4, 16
; VI-NEXT: s_lshr_b32 s60, s4, 8
-; VI-NEXT: s_lshr_b32 s61, s29, 24
-; VI-NEXT: s_lshr_b32 s62, s29, 16
-; VI-NEXT: s_lshr_b32 s63, s29, 8
-; VI-NEXT: s_lshr_b32 s72, s28, 16
-; VI-NEXT: s_lshr_b32 s73, s28, 8
-; VI-NEXT: s_lshr_b32 s74, s27, 24
-; VI-NEXT: s_lshr_b32 s75, s27, 16
-; VI-NEXT: s_lshr_b32 s76, s27, 8
-; VI-NEXT: s_lshr_b32 s77, s26, 16
-; VI-NEXT: s_lshr_b32 s78, s26, 8
-; VI-NEXT: s_lshr_b32 s79, s25, 24
-; VI-NEXT: s_lshr_b32 s88, s25, 16
-; VI-NEXT: s_lshr_b32 s89, s25, 8
-; VI-NEXT: s_lshr_b32 s90, s24, 16
-; VI-NEXT: s_lshr_b32 s91, s24, 8
-; VI-NEXT: s_lshr_b32 s30, s23, 24
-; VI-NEXT: s_lshr_b32 s31, s23, 16
-; VI-NEXT: s_lshr_b32 s34, s23, 8
-; VI-NEXT: s_lshr_b32 s35, s22, 16
-; VI-NEXT: s_lshr_b32 s36, s22, 8
-; VI-NEXT: s_lshr_b32 s37, s21, 24
-; VI-NEXT: s_lshr_b32 s38, s21, 16
-; VI-NEXT: s_lshr_b32 s39, s21, 8
-; VI-NEXT: s_lshr_b32 s48, s20, 16
-; VI-NEXT: s_lshr_b32 s49, s20, 8
-; VI-NEXT: s_lshr_b32 s50, s19, 24
-; VI-NEXT: s_lshr_b32 s51, s19, 16
-; VI-NEXT: s_lshr_b32 s52, s19, 8
-; VI-NEXT: s_lshr_b32 s53, s18, 16
-; VI-NEXT: s_lshr_b32 s54, s18, 8
-; VI-NEXT: s_lshr_b32 s55, s17, 24
-; VI-NEXT: s_lshr_b32 s64, s17, 16
-; VI-NEXT: s_lshr_b32 s65, s17, 8
-; VI-NEXT: s_lshr_b32 s66, s16, 16
-; VI-NEXT: s_lshr_b32 s67, s16, 8
-; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24
-; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24
-; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
-; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24
-; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24
-; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24
-; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24
+; VI-NEXT: s_lshr_b32 s61, s7, 24
+; VI-NEXT: s_lshr_b32 s62, s7, 16
+; VI-NEXT: s_lshr_b32 s63, s7, 8
+; VI-NEXT: s_lshr_b32 s72, s6, 16
+; VI-NEXT: s_lshr_b32 s73, s6, 8
+; VI-NEXT: s_lshr_b32 s74, s9, 24
+; VI-NEXT: s_lshr_b32 s75, s9, 16
+; VI-NEXT: s_lshr_b32 s76, s9, 8
+; VI-NEXT: s_lshr_b32 s77, s8, 16
+; VI-NEXT: s_lshr_b32 s78, s8, 8
+; VI-NEXT: s_lshr_b32 s79, s11, 24
+; VI-NEXT: s_lshr_b32 s88, s11, 16
+; VI-NEXT: s_lshr_b32 s89, s11, 8
+; VI-NEXT: s_lshr_b32 s90, s10, 16
+; VI-NEXT: s_lshr_b32 s91, s10, 8
+; VI-NEXT: s_lshr_b32 s30, s13, 24
+; VI-NEXT: s_lshr_b32 s31, s13, 16
+; VI-NEXT: s_lshr_b32 s34, s13, 8
+; VI-NEXT: s_lshr_b32 s35, s12, 16
+; VI-NEXT: s_lshr_b32 s36, s12, 8
+; VI-NEXT: s_lshr_b32 s37, s15, 24
+; VI-NEXT: s_lshr_b32 s38, s15, 16
+; VI-NEXT: s_lshr_b32 s39, s15, 8
+; VI-NEXT: s_lshr_b32 s48, s14, 16
+; VI-NEXT: s_lshr_b32 s49, s14, 8
+; VI-NEXT: s_lshr_b32 s50, s17, 24
+; VI-NEXT: s_lshr_b32 s51, s17, 16
+; VI-NEXT: s_lshr_b32 s52, s17, 8
+; VI-NEXT: s_lshr_b32 s53, s16, 16
+; VI-NEXT: s_lshr_b32 s54, s16, 8
+; VI-NEXT: s_lshr_b32 s55, s19, 24
+; VI-NEXT: s_lshr_b32 s64, s19, 16
+; VI-NEXT: s_lshr_b32 s65, s19, 8
+; VI-NEXT: s_lshr_b32 s66, s18, 16
+; VI-NEXT: s_lshr_b32 s67, s18, 8
+; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24
+; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24
+; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24
+; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24
+; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24
+; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24
+; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24
; VI-NEXT: s_cbranch_execnz .LBB25_3
; VI-NEXT: .LBB25_2: ; %cmp.true
-; VI-NEXT: s_add_i32 s17, s17, 3
-; VI-NEXT: s_add_i32 s16, s16, 3
; VI-NEXT: s_add_i32 s19, s19, 3
; VI-NEXT: s_add_i32 s18, s18, 3
-; VI-NEXT: s_add_i32 s21, s21, 3
-; VI-NEXT: s_add_i32 s20, s20, 3
-; VI-NEXT: s_add_i32 s23, s23, 3
-; VI-NEXT: s_add_i32 s22, s22, 3
-; VI-NEXT: s_add_i32 s25, s25, 3
-; VI-NEXT: s_add_i32 s24, s24, 3
-; VI-NEXT: s_add_i32 s27, s27, 3
-; VI-NEXT: s_add_i32 s26, s26, 3
-; VI-NEXT: s_add_i32 s29, s29, 3
-; VI-NEXT: s_add_i32 s28, s28, 3
+; VI-NEXT: s_add_i32 s17, s17, 3
+; VI-NEXT: s_add_i32 s16, s16, 3
+; VI-NEXT: s_add_i32 s15, s15, 3
+; VI-NEXT: s_add_i32 s14, s14, 3
+; VI-NEXT: s_add_i32 s13, s13, 3
+; VI-NEXT: s_add_i32 s12, s12, 3
+; VI-NEXT: s_add_i32 s11, s11, 3
+; VI-NEXT: s_add_i32 s10, s10, 3
+; VI-NEXT: s_add_i32 s9, s9, 3
+; VI-NEXT: s_add_i32 s8, s8, 3
+; VI-NEXT: s_add_i32 s7, s7, 3
+; VI-NEXT: s_add_i32 s6, s6, 3
; VI-NEXT: s_add_i32 s5, s5, 3
; VI-NEXT: s_add_i32 s4, s4, 3
-; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24
-; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24
-; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
-; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24
-; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24
-; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24
-; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24
+; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24
+; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24
+; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24
+; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24
+; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24
+; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24
; VI-NEXT: s_lshr_b32 s56, s5, 24
; VI-NEXT: s_lshr_b32 s57, s5, 16
; VI-NEXT: s_lshr_b32 s58, s5, 8
; VI-NEXT: s_lshr_b32 s59, s4, 16
; VI-NEXT: s_lshr_b32 s60, s4, 8
-; VI-NEXT: s_lshr_b32 s61, s29, 24
-; VI-NEXT: s_lshr_b32 s62, s29, 16
-; VI-NEXT: s_lshr_b32 s63, s29, 8
-; VI-NEXT: s_lshr_b32 s72, s28, 16
-; VI-NEXT: s_lshr_b32 s73, s28, 8
-; VI-NEXT: s_lshr_b32 s74, s27, 24
-; VI-NEXT: s_lshr_b32 s75, s27, 16
-; VI-NEXT: s_lshr_b32 s76, s27, 8
-; VI-NEXT: s_lshr_b32 s77, s26, 16
-; VI-NEXT: s_lshr_b32 s78, s26, 8
-; VI-NEXT: s_lshr_b32 s79, s25, 24
-; VI-NEXT: s_lshr_b32 s88, s25, 16
-; VI-NEXT: s_lshr_b32 s89, s25, 8
-; VI-NEXT: s_lshr_b32 s90, s24, 16
-; VI-NEXT: s_lshr_b32 s91, s24, 8
-; VI-NEXT: s_lshr_b32 s30, s23, 24
-; VI-NEXT: s_lshr_b32 s31, s23, 16
-; VI-NEXT: s_lshr_b32 s34, s23, 8
-; VI-NEXT: s_lshr_b32 s35, s22, 16
-; VI-NEXT: s_lshr_b32 s36, s22, 8
-; VI-NEXT: s_lshr_b32 s37, s21, 24
-; VI-NEXT: s_lshr_b32 s38, s21, 16
-; VI-NEXT: s_lshr_b32 s39, s21, 8
-; VI-NEXT: s_lshr_b32 s48, s20, 16
-; VI-NEXT: s_lshr_b32 s49, s20, 8
-; VI-NEXT: s_lshr_b32 s50, s19, 24
-; VI-NEXT: s_lshr_b32 s51, s19, 16
-; VI-NEXT: s_lshr_b32 s52, s19, 8
-; VI-NEXT: s_lshr_b32 s53, s18, 16
-; VI-NEXT: s_lshr_b32 s54, s18, 8
-; VI-NEXT: s_lshr_b32 s55, s17, 24
-; VI-NEXT: s_lshr_b32 s64, s17, 16
-; VI-NEXT: s_lshr_b32 s65, s17, 8
-; VI-NEXT: s_lshr_b32 s66, s16, 16
-; VI-NEXT: s_lshr_b32 s67, s16, 8
+; VI-NEXT: s_lshr_b32 s61, s7, 24
+; VI-NEXT: s_lshr_b32 s62, s7, 16
+; VI-NEXT: s_lshr_b32 s63, s7, 8
+; VI-NEXT: s_lshr_b32 s72, s6, 16
+; VI-NEXT: s_lshr_b32 s73, s6, 8
+; VI-NEXT: s_lshr_b32 s74, s9, 24
+; VI-NEXT: s_lshr_b32 s75, s9, 16
+; VI-NEXT: s_lshr_b32 s76, s9, 8
+; VI-NEXT: s_lshr_b32 s77, s8, 16
+; VI-NEXT: s_lshr_b32 s78, s8, 8
+; VI-NEXT: s_lshr_b32 s79, s11, 24
+; VI-NEXT: s_lshr_b32 s88, s11, 16
+; VI-NEXT: s_lshr_b32 s89, s11, 8
+; VI-NEXT: s_lshr_b32 s90, s10, 16
+; VI-NEXT: s_lshr_b32 s91, s10, 8
+; VI-NEXT: s_lshr_b32 s30, s13, 24
+; VI-NEXT: s_lshr_b32 s31, s13, 16
+; VI-NEXT: s_lshr_b32 s34, s13, 8
+; VI-NEXT: s_lshr_b32 s35, s12, 16
+; VI-NEXT: s_lshr_b32 s36, s12, 8
+; VI-NEXT: s_lshr_b32 s37, s15, 24
+; VI-NEXT: s_lshr_b32 s38, s15, 16
+; VI-NEXT: s_lshr_b32 s39, s15, 8
+; VI-NEXT: s_lshr_b32 s48, s14, 16
+; VI-NEXT: s_lshr_b32 s49, s14, 8
+; VI-NEXT: s_lshr_b32 s50, s17, 24
+; VI-NEXT: s_lshr_b32 s51, s17, 16
+; VI-NEXT: s_lshr_b32 s52, s17, 8
+; VI-NEXT: s_lshr_b32 s53, s16, 16
+; VI-NEXT: s_lshr_b32 s54, s16, 8
+; VI-NEXT: s_lshr_b32 s55, s19, 24
+; VI-NEXT: s_lshr_b32 s64, s19, 16
+; VI-NEXT: s_lshr_b32 s65, s19, 8
+; VI-NEXT: s_lshr_b32 s66, s18, 16
+; VI-NEXT: s_lshr_b32 s67, s18, 8
; VI-NEXT: .LBB25_3: ; %end
-; VI-NEXT: s_and_b32 s7, s16, 0xff
-; VI-NEXT: s_lshl_b32 s9, s67, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s66, 0xff
-; VI-NEXT: s_lshl_b32 s11, s44, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: s_and_b32 s7, s17, 0xff
-; VI-NEXT: s_lshl_b32 s9, s65, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s64, 0xff
-; VI-NEXT: s_lshl_b32 s11, s55, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s18, 0xff
-; VI-NEXT: s_lshl_b32 s9, s54, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s53, 0xff
-; VI-NEXT: s_lshl_b32 s11, s42, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: s_and_b32 s18, s18, 0xff
+; VI-NEXT: s_lshl_b32 s21, s67, 8
+; VI-NEXT: s_or_b32 s18, s18, s21
+; VI-NEXT: s_and_b32 s21, s66, 0xff
+; VI-NEXT: s_lshl_b32 s23, s44, 8
+; VI-NEXT: s_or_b32 s21, s21, s23
+; VI-NEXT: s_and_b32 s18, s18, 0xffff
+; VI-NEXT: s_lshl_b32 s21, s21, 16
+; VI-NEXT: s_or_b32 s18, s18, s21
+; VI-NEXT: v_mov_b32_e32 v1, s18
+; VI-NEXT: s_and_b32 s18, s19, 0xff
+; VI-NEXT: s_lshl_b32 s19, s65, 8
+; VI-NEXT: s_or_b32 s18, s18, s19
+; VI-NEXT: s_and_b32 s19, s64, 0xff
+; VI-NEXT: s_lshl_b32 s21, s55, 8
+; VI-NEXT: s_or_b32 s19, s19, s21
+; VI-NEXT: s_and_b32 s18, s18, 0xffff
+; VI-NEXT: s_lshl_b32 s19, s19, 16
+; VI-NEXT: s_or_b32 s18, s18, s19
+; VI-NEXT: v_mov_b32_e32 v2, s18
+; VI-NEXT: s_and_b32 s16, s16, 0xff
+; VI-NEXT: s_lshl_b32 s18, s54, 8
+; VI-NEXT: s_or_b32 s16, s16, s18
+; VI-NEXT: s_and_b32 s18, s53, 0xff
+; VI-NEXT: s_lshl_b32 s19, s42, 8
+; VI-NEXT: s_or_b32 s18, s18, s19
+; VI-NEXT: s_and_b32 s16, s16, 0xffff
+; VI-NEXT: s_lshl_b32 s18, s18, 16
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s16, s16, s18
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s19, 0xff
-; VI-NEXT: s_lshl_b32 s9, s52, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s51, 0xff
-; VI-NEXT: s_lshl_b32 s11, s50, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s16
+; VI-NEXT: s_and_b32 s16, s17, 0xff
+; VI-NEXT: s_lshl_b32 s17, s52, 8
+; VI-NEXT: s_or_b32 s16, s16, s17
+; VI-NEXT: s_and_b32 s17, s51, 0xff
+; VI-NEXT: s_lshl_b32 s18, s50, 8
+; VI-NEXT: s_or_b32 s17, s17, s18
+; VI-NEXT: s_and_b32 s16, s16, 0xffff
+; VI-NEXT: s_lshl_b32 s17, s17, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s16, s16, s17
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s20, 0xff
-; VI-NEXT: s_lshl_b32 s9, s49, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s48, 0xff
-; VI-NEXT: s_lshl_b32 s11, s40, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s16
+; VI-NEXT: s_and_b32 s14, s14, 0xff
+; VI-NEXT: s_lshl_b32 s16, s49, 8
+; VI-NEXT: s_or_b32 s14, s14, s16
+; VI-NEXT: s_and_b32 s16, s48, 0xff
+; VI-NEXT: s_lshl_b32 s17, s40, 8
+; VI-NEXT: s_or_b32 s16, s16, s17
+; VI-NEXT: s_and_b32 s14, s14, 0xffff
+; VI-NEXT: s_lshl_b32 s16, s16, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s14, s14, s16
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s21, 0xff
-; VI-NEXT: s_lshl_b32 s9, s39, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s38, 0xff
-; VI-NEXT: s_lshl_b32 s11, s37, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s14
+; VI-NEXT: s_and_b32 s14, s15, 0xff
+; VI-NEXT: s_lshl_b32 s15, s39, 8
+; VI-NEXT: s_or_b32 s14, s14, s15
+; VI-NEXT: s_and_b32 s15, s38, 0xff
+; VI-NEXT: s_lshl_b32 s16, s37, 8
+; VI-NEXT: s_or_b32 s15, s15, s16
+; VI-NEXT: s_and_b32 s14, s14, 0xffff
+; VI-NEXT: s_lshl_b32 s15, s15, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s14, s14, s15
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s22, 0xff
-; VI-NEXT: s_lshl_b32 s9, s36, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s35, 0xff
-; VI-NEXT: s_lshl_b32 s11, s14, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s14
+; VI-NEXT: s_and_b32 s12, s12, 0xff
+; VI-NEXT: s_lshl_b32 s14, s36, 8
+; VI-NEXT: s_or_b32 s12, s12, s14
+; VI-NEXT: s_and_b32 s14, s35, 0xff
+; VI-NEXT: s_lshl_b32 s15, s28, 8
+; VI-NEXT: s_or_b32 s14, s14, s15
+; VI-NEXT: s_and_b32 s12, s12, 0xffff
+; VI-NEXT: s_lshl_b32 s14, s14, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s12, s12, s14
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s23, 0xff
-; VI-NEXT: s_lshl_b32 s9, s34, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s31, 0xff
-; VI-NEXT: s_lshl_b32 s11, s30, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s12
+; VI-NEXT: s_and_b32 s12, s13, 0xff
+; VI-NEXT: s_lshl_b32 s13, s34, 8
+; VI-NEXT: s_or_b32 s12, s12, s13
+; VI-NEXT: s_and_b32 s13, s31, 0xff
+; VI-NEXT: s_lshl_b32 s14, s30, 8
+; VI-NEXT: s_or_b32 s13, s13, s14
+; VI-NEXT: s_and_b32 s12, s12, 0xffff
+; VI-NEXT: s_lshl_b32 s13, s13, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s12, s12, s13
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s24, 0xff
-; VI-NEXT: s_lshl_b32 s9, s91, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s90, 0xff
-; VI-NEXT: s_lshl_b32 s11, s12, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s12
+; VI-NEXT: s_and_b32 s10, s10, 0xff
+; VI-NEXT: s_lshl_b32 s12, s91, 8
+; VI-NEXT: s_or_b32 s10, s10, s12
+; VI-NEXT: s_and_b32 s12, s90, 0xff
+; VI-NEXT: s_lshl_b32 s13, s26, 8
+; VI-NEXT: s_or_b32 s12, s12, s13
+; VI-NEXT: s_and_b32 s10, s10, 0xffff
+; VI-NEXT: s_lshl_b32 s12, s12, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s10, s10, s12
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s25, 0xff
-; VI-NEXT: s_lshl_b32 s9, s89, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s88, 0xff
-; VI-NEXT: s_lshl_b32 s11, s79, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: s_and_b32 s10, s11, 0xff
+; VI-NEXT: s_lshl_b32 s11, s89, 8
+; VI-NEXT: s_or_b32 s10, s10, s11
+; VI-NEXT: s_and_b32 s11, s88, 0xff
+; VI-NEXT: s_lshl_b32 s12, s79, 8
+; VI-NEXT: s_or_b32 s11, s11, s12
+; VI-NEXT: s_and_b32 s10, s10, 0xffff
+; VI-NEXT: s_lshl_b32 s11, s11, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s10, s10, s11
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s26, 0xff
-; VI-NEXT: s_lshl_b32 s9, s78, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s77, 0xff
-; VI-NEXT: s_lshl_b32 s10, s10, 8
-; VI-NEXT: s_or_b32 s9, s9, s10
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: s_and_b32 s8, s8, 0xff
+; VI-NEXT: s_lshl_b32 s10, s78, 8
+; VI-NEXT: s_or_b32 s8, s8, s10
+; VI-NEXT: s_and_b32 s10, s77, 0xff
+; VI-NEXT: s_lshl_b32 s11, s24, 8
+; VI-NEXT: s_or_b32 s10, s10, s11
+; VI-NEXT: s_and_b32 s8, s8, 0xffff
+; VI-NEXT: s_lshl_b32 s10, s10, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s8, s8, s10
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s27, 0xff
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: s_and_b32 s8, s9, 0xff
; VI-NEXT: s_lshl_b32 s9, s76, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s8, s8, s9
; VI-NEXT: s_and_b32 s9, s75, 0xff
; VI-NEXT: s_lshl_b32 s10, s74, 8
; VI-NEXT: s_or_b32 s9, s9, s10
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_and_b32 s8, s8, 0xffff
; VI-NEXT: s_lshl_b32 s9, s9, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s8, s8, s9
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s28, 0xff
-; VI-NEXT: s_lshl_b32 s9, s73, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s72, 0xff
-; VI-NEXT: s_lshl_b32 s8, s8, 8
-; VI-NEXT: s_or_b32 s8, s9, s8
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xff
+; VI-NEXT: s_lshl_b32 s8, s73, 8
+; VI-NEXT: s_or_b32 s6, s6, s8
+; VI-NEXT: s_and_b32 s8, s72, 0xff
+; VI-NEXT: s_lshl_b32 s9, s22, 8
+; VI-NEXT: s_or_b32 s8, s8, s9
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
; VI-NEXT: s_lshl_b32 s8, s8, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0
-; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_or_b32 s6, s6, s8
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s29, 0xff
-; VI-NEXT: s_lshl_b32 s8, s63, 8
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: s_and_b32 s6, s7, 0xff
+; VI-NEXT: s_lshl_b32 s7, s63, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s62, 0xff
+; VI-NEXT: s_lshl_b32 s8, s61, 8
; VI-NEXT: s_or_b32 s7, s7, s8
-; VI-NEXT: s_and_b32 s8, s62, 0xff
-; VI-NEXT: s_lshl_b32 s9, s61, 8
-; VI-NEXT: s_or_b32 s8, s8, s9
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s8, s8, 16
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0
-; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_or_b32 s6, s6, s7
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: s_and_b32 s4, s4, 0xff
-; VI-NEXT: s_lshl_b32 s7, s60, 8
-; VI-NEXT: s_or_b32 s4, s4, s7
-; VI-NEXT: s_and_b32 s7, s59, 0xff
-; VI-NEXT: s_lshl_b32 s6, s6, 8
-; VI-NEXT: s_or_b32 s6, s7, s6
+; VI-NEXT: s_lshl_b32 s6, s60, 8
+; VI-NEXT: s_or_b32 s4, s4, s6
+; VI-NEXT: s_and_b32 s6, s59, 0xff
+; VI-NEXT: s_lshl_b32 s7, s20, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
; VI-NEXT: s_and_b32 s4, s4, 0xffff
; VI-NEXT: s_lshl_b32 s6, s6, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0
@@ -10486,28 +10639,28 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_readlane_b32 s67, v4, 19
-; VI-NEXT: v_readlane_b32 s66, v4, 18
-; VI-NEXT: v_readlane_b32 s65, v4, 17
-; VI-NEXT: v_readlane_b32 s64, v4, 16
-; VI-NEXT: v_readlane_b32 s55, v4, 15
-; VI-NEXT: v_readlane_b32 s54, v4, 14
-; VI-NEXT: v_readlane_b32 s53, v4, 13
-; VI-NEXT: v_readlane_b32 s52, v4, 12
-; VI-NEXT: v_readlane_b32 s51, v4, 11
-; VI-NEXT: v_readlane_b32 s50, v4, 10
-; VI-NEXT: v_readlane_b32 s49, v4, 9
-; VI-NEXT: v_readlane_b32 s48, v4, 8
-; VI-NEXT: v_readlane_b32 s39, v4, 7
-; VI-NEXT: v_readlane_b32 s38, v4, 6
-; VI-NEXT: v_readlane_b32 s37, v4, 5
-; VI-NEXT: v_readlane_b32 s36, v4, 4
-; VI-NEXT: v_readlane_b32 s35, v4, 3
-; VI-NEXT: v_readlane_b32 s34, v4, 2
-; VI-NEXT: v_readlane_b32 s31, v4, 1
-; VI-NEXT: v_readlane_b32 s30, v4, 0
+; VI-NEXT: v_readlane_b32 s67, v18, 19
+; VI-NEXT: v_readlane_b32 s66, v18, 18
+; VI-NEXT: v_readlane_b32 s65, v18, 17
+; VI-NEXT: v_readlane_b32 s64, v18, 16
+; VI-NEXT: v_readlane_b32 s55, v18, 15
+; VI-NEXT: v_readlane_b32 s54, v18, 14
+; VI-NEXT: v_readlane_b32 s53, v18, 13
+; VI-NEXT: v_readlane_b32 s52, v18, 12
+; VI-NEXT: v_readlane_b32 s51, v18, 11
+; VI-NEXT: v_readlane_b32 s50, v18, 10
+; VI-NEXT: v_readlane_b32 s49, v18, 9
+; VI-NEXT: v_readlane_b32 s48, v18, 8
+; VI-NEXT: v_readlane_b32 s39, v18, 7
+; VI-NEXT: v_readlane_b32 s38, v18, 6
+; VI-NEXT: v_readlane_b32 s37, v18, 5
+; VI-NEXT: v_readlane_b32 s36, v18, 4
+; VI-NEXT: v_readlane_b32 s35, v18, 3
+; VI-NEXT: v_readlane_b32 s34, v18, 2
+; VI-NEXT: v_readlane_b32 s31, v18, 1
+; VI-NEXT: v_readlane_b32 s30, v18, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -10532,31 +10685,31 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; VI-NEXT: ; implicit-def: $sgpr37
; VI-NEXT: ; implicit-def: $sgpr36
; VI-NEXT: ; implicit-def: $sgpr35
-; VI-NEXT: ; implicit-def: $sgpr14
+; VI-NEXT: ; implicit-def: $sgpr28
; VI-NEXT: ; implicit-def: $sgpr34
; VI-NEXT: ; implicit-def: $sgpr31
; VI-NEXT: ; implicit-def: $sgpr30
; VI-NEXT: ; implicit-def: $sgpr91
; VI-NEXT: ; implicit-def: $sgpr90
-; VI-NEXT: ; implicit-def: $sgpr12
+; VI-NEXT: ; implicit-def: $sgpr26
; VI-NEXT: ; implicit-def: $sgpr89
; VI-NEXT: ; implicit-def: $sgpr88
; VI-NEXT: ; implicit-def: $sgpr79
; VI-NEXT: ; implicit-def: $sgpr78
; VI-NEXT: ; implicit-def: $sgpr77
-; VI-NEXT: ; implicit-def: $sgpr10
+; VI-NEXT: ; implicit-def: $sgpr24
; VI-NEXT: ; implicit-def: $sgpr76
; VI-NEXT: ; implicit-def: $sgpr75
; VI-NEXT: ; implicit-def: $sgpr74
; VI-NEXT: ; implicit-def: $sgpr73
; VI-NEXT: ; implicit-def: $sgpr72
-; VI-NEXT: ; implicit-def: $sgpr8
+; VI-NEXT: ; implicit-def: $sgpr22
; VI-NEXT: ; implicit-def: $sgpr63
; VI-NEXT: ; implicit-def: $sgpr62
; VI-NEXT: ; implicit-def: $sgpr61
; VI-NEXT: ; implicit-def: $sgpr60
; VI-NEXT: ; implicit-def: $sgpr59
-; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr20
; VI-NEXT: ; implicit-def: $sgpr58
; VI-NEXT: ; implicit-def: $sgpr57
; VI-NEXT: ; implicit-def: $sgpr56
@@ -10566,28 +10719,56 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v4, s30, 0
-; GFX9-NEXT: v_writelane_b32 v4, s31, 1
-; GFX9-NEXT: v_writelane_b32 v4, s34, 2
-; GFX9-NEXT: v_writelane_b32 v4, s35, 3
-; GFX9-NEXT: v_writelane_b32 v4, s36, 4
-; GFX9-NEXT: v_writelane_b32 v4, s37, 5
-; GFX9-NEXT: v_writelane_b32 v4, s38, 6
-; GFX9-NEXT: v_writelane_b32 v4, s39, 7
-; GFX9-NEXT: v_writelane_b32 v4, s48, 8
-; GFX9-NEXT: v_writelane_b32 v4, s49, 9
-; GFX9-NEXT: v_writelane_b32 v4, s50, 10
-; GFX9-NEXT: v_writelane_b32 v4, s51, 11
-; GFX9-NEXT: v_writelane_b32 v4, s52, 12
-; GFX9-NEXT: v_writelane_b32 v4, s53, 13
+; GFX9-NEXT: v_writelane_b32 v18, s30, 0
+; GFX9-NEXT: v_writelane_b32 v18, s31, 1
+; GFX9-NEXT: v_writelane_b32 v18, s34, 2
+; GFX9-NEXT: v_writelane_b32 v18, s35, 3
+; GFX9-NEXT: v_writelane_b32 v18, s36, 4
+; GFX9-NEXT: v_writelane_b32 v18, s37, 5
+; GFX9-NEXT: v_writelane_b32 v18, s38, 6
+; GFX9-NEXT: v_writelane_b32 v18, s39, 7
+; GFX9-NEXT: v_writelane_b32 v18, s48, 8
+; GFX9-NEXT: v_writelane_b32 v18, s49, 9
+; GFX9-NEXT: v_writelane_b32 v18, s50, 10
+; GFX9-NEXT: v_writelane_b32 v18, s51, 11
+; GFX9-NEXT: v_writelane_b32 v18, s52, 12
+; GFX9-NEXT: v_writelane_b32 v18, s53, 13
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: v_mov_b32_e32 v5, s17
+; GFX9-NEXT: v_mov_b32_e32 v6, s18
+; GFX9-NEXT: v_mov_b32_e32 v7, s19
+; GFX9-NEXT: v_mov_b32_e32 v8, s20
+; GFX9-NEXT: v_mov_b32_e32 v9, s21
+; GFX9-NEXT: v_mov_b32_e32 v10, s22
+; GFX9-NEXT: v_mov_b32_e32 v11, s23
+; GFX9-NEXT: v_mov_b32_e32 v12, s24
+; GFX9-NEXT: v_mov_b32_e32 v13, s25
+; GFX9-NEXT: v_mov_b32_e32 v14, s26
+; GFX9-NEXT: v_mov_b32_e32 v15, s27
+; GFX9-NEXT: v_mov_b32_e32 v16, s28
+; GFX9-NEXT: v_mov_b32_e32 v17, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT: v_writelane_b32 v4, s54, 14
+; GFX9-NEXT: v_writelane_b32 v18, s54, 14
+; GFX9-NEXT: v_readfirstlane_b32 s18, v4
+; GFX9-NEXT: v_readfirstlane_b32 s19, v5
+; GFX9-NEXT: v_readfirstlane_b32 s16, v6
+; GFX9-NEXT: v_readfirstlane_b32 s17, v7
+; GFX9-NEXT: v_readfirstlane_b32 s14, v8
+; GFX9-NEXT: v_readfirstlane_b32 s15, v9
+; GFX9-NEXT: v_readfirstlane_b32 s12, v10
+; GFX9-NEXT: v_readfirstlane_b32 s13, v11
+; GFX9-NEXT: v_readfirstlane_b32 s10, v12
+; GFX9-NEXT: v_readfirstlane_b32 s11, v13
+; GFX9-NEXT: v_readfirstlane_b32 s8, v14
+; GFX9-NEXT: v_readfirstlane_b32 s9, v15
+; GFX9-NEXT: v_readfirstlane_b32 s6, v16
+; GFX9-NEXT: v_readfirstlane_b32 s7, v17
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
-; GFX9-NEXT: v_writelane_b32 v4, s55, 15
+; GFX9-NEXT: v_writelane_b32 v18, s55, 15
; GFX9-NEXT: s_cbranch_scc0 .LBB25_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s56, s5, 24
@@ -10595,275 +10776,275 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; GFX9-NEXT: s_lshr_b32 s58, s5, 8
; GFX9-NEXT: s_lshr_b32 s59, s4, 16
; GFX9-NEXT: s_lshr_b32 s60, s4, 8
-; GFX9-NEXT: s_lshr_b32 s61, s29, 24
-; GFX9-NEXT: s_lshr_b32 s62, s29, 16
-; GFX9-NEXT: s_lshr_b32 s63, s29, 8
-; GFX9-NEXT: s_lshr_b32 s72, s28, 16
-; GFX9-NEXT: s_lshr_b32 s73, s28, 8
-; GFX9-NEXT: s_lshr_b32 s74, s27, 24
-; GFX9-NEXT: s_lshr_b32 s75, s27, 16
-; GFX9-NEXT: s_lshr_b32 s76, s27, 8
-; GFX9-NEXT: s_lshr_b32 s77, s26, 16
-; GFX9-NEXT: s_lshr_b32 s78, s26, 8
-; GFX9-NEXT: s_lshr_b32 s79, s25, 24
-; GFX9-NEXT: s_lshr_b32 s88, s25, 16
-; GFX9-NEXT: s_lshr_b32 s89, s25, 8
-; GFX9-NEXT: s_lshr_b32 s90, s24, 16
-; GFX9-NEXT: s_lshr_b32 s91, s24, 8
-; GFX9-NEXT: s_lshr_b32 s92, s23, 24
-; GFX9-NEXT: s_lshr_b32 s93, s23, 16
-; GFX9-NEXT: s_lshr_b32 s94, s23, 8
-; GFX9-NEXT: s_lshr_b32 s95, s22, 16
-; GFX9-NEXT: s_lshr_b32 s30, s22, 8
-; GFX9-NEXT: s_lshr_b32 s31, s21, 24
-; GFX9-NEXT: s_lshr_b32 s34, s21, 16
-; GFX9-NEXT: s_lshr_b32 s35, s21, 8
-; GFX9-NEXT: s_lshr_b32 s36, s20, 16
-; GFX9-NEXT: s_lshr_b32 s37, s20, 8
-; GFX9-NEXT: s_lshr_b32 s38, s19, 24
-; GFX9-NEXT: s_lshr_b32 s39, s19, 16
-; GFX9-NEXT: s_lshr_b32 s48, s19, 8
-; GFX9-NEXT: s_lshr_b32 s49, s18, 16
-; GFX9-NEXT: s_lshr_b32 s50, s18, 8
-; GFX9-NEXT: s_lshr_b32 s51, s17, 24
-; GFX9-NEXT: s_lshr_b32 s52, s17, 16
-; GFX9-NEXT: s_lshr_b32 s53, s17, 8
-; GFX9-NEXT: s_lshr_b32 s54, s16, 16
-; GFX9-NEXT: s_lshr_b32 s55, s16, 8
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24
-; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
-; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24
-; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24
-; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24
-; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24
+; GFX9-NEXT: s_lshr_b32 s61, s7, 24
+; GFX9-NEXT: s_lshr_b32 s62, s7, 16
+; GFX9-NEXT: s_lshr_b32 s63, s7, 8
+; GFX9-NEXT: s_lshr_b32 s72, s6, 16
+; GFX9-NEXT: s_lshr_b32 s73, s6, 8
+; GFX9-NEXT: s_lshr_b32 s74, s9, 24
+; GFX9-NEXT: s_lshr_b32 s75, s9, 16
+; GFX9-NEXT: s_lshr_b32 s76, s9, 8
+; GFX9-NEXT: s_lshr_b32 s77, s8, 16
+; GFX9-NEXT: s_lshr_b32 s78, s8, 8
+; GFX9-NEXT: s_lshr_b32 s79, s11, 24
+; GFX9-NEXT: s_lshr_b32 s88, s11, 16
+; GFX9-NEXT: s_lshr_b32 s89, s11, 8
+; GFX9-NEXT: s_lshr_b32 s90, s10, 16
+; GFX9-NEXT: s_lshr_b32 s91, s10, 8
+; GFX9-NEXT: s_lshr_b32 s92, s13, 24
+; GFX9-NEXT: s_lshr_b32 s93, s13, 16
+; GFX9-NEXT: s_lshr_b32 s94, s13, 8
+; GFX9-NEXT: s_lshr_b32 s95, s12, 16
+; GFX9-NEXT: s_lshr_b32 s30, s12, 8
+; GFX9-NEXT: s_lshr_b32 s31, s15, 24
+; GFX9-NEXT: s_lshr_b32 s34, s15, 16
+; GFX9-NEXT: s_lshr_b32 s35, s15, 8
+; GFX9-NEXT: s_lshr_b32 s36, s14, 16
+; GFX9-NEXT: s_lshr_b32 s37, s14, 8
+; GFX9-NEXT: s_lshr_b32 s38, s17, 24
+; GFX9-NEXT: s_lshr_b32 s39, s17, 16
+; GFX9-NEXT: s_lshr_b32 s48, s17, 8
+; GFX9-NEXT: s_lshr_b32 s49, s16, 16
+; GFX9-NEXT: s_lshr_b32 s50, s16, 8
+; GFX9-NEXT: s_lshr_b32 s51, s19, 24
+; GFX9-NEXT: s_lshr_b32 s52, s19, 16
+; GFX9-NEXT: s_lshr_b32 s53, s19, 8
+; GFX9-NEXT: s_lshr_b32 s54, s18, 16
+; GFX9-NEXT: s_lshr_b32 s55, s18, 8
+; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], 24
+; GFX9-NEXT: s_lshr_b64 s[22:23], s[6:7], 24
+; GFX9-NEXT: s_lshr_b64 s[24:25], s[8:9], 24
+; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24
+; GFX9-NEXT: s_lshr_b64 s[28:29], s[12:13], 24
+; GFX9-NEXT: s_lshr_b64 s[40:41], s[14:15], 24
+; GFX9-NEXT: s_lshr_b64 s[42:43], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[44:45], s[18:19], 24
; GFX9-NEXT: s_cbranch_execnz .LBB25_3
; GFX9-NEXT: .LBB25_2: ; %cmp.true
-; GFX9-NEXT: s_add_i32 s17, s17, 3
-; GFX9-NEXT: s_add_i32 s16, s16, 3
; GFX9-NEXT: s_add_i32 s19, s19, 3
; GFX9-NEXT: s_add_i32 s18, s18, 3
-; GFX9-NEXT: s_add_i32 s21, s21, 3
-; GFX9-NEXT: s_add_i32 s20, s20, 3
-; GFX9-NEXT: s_add_i32 s23, s23, 3
-; GFX9-NEXT: s_add_i32 s22, s22, 3
-; GFX9-NEXT: s_add_i32 s25, s25, 3
-; GFX9-NEXT: s_add_i32 s24, s24, 3
-; GFX9-NEXT: s_add_i32 s27, s27, 3
-; GFX9-NEXT: s_add_i32 s26, s26, 3
-; GFX9-NEXT: s_add_i32 s29, s29, 3
-; GFX9-NEXT: s_add_i32 s28, s28, 3
+; GFX9-NEXT: s_add_i32 s17, s17, 3
+; GFX9-NEXT: s_add_i32 s16, s16, 3
+; GFX9-NEXT: s_add_i32 s15, s15, 3
+; GFX9-NEXT: s_add_i32 s14, s14, 3
+; GFX9-NEXT: s_add_i32 s13, s13, 3
+; GFX9-NEXT: s_add_i32 s12, s12, 3
+; GFX9-NEXT: s_add_i32 s11, s11, 3
+; GFX9-NEXT: s_add_i32 s10, s10, 3
+; GFX9-NEXT: s_add_i32 s9, s9, 3
+; GFX9-NEXT: s_add_i32 s8, s8, 3
+; GFX9-NEXT: s_add_i32 s7, s7, 3
+; GFX9-NEXT: s_add_i32 s6, s6, 3
; GFX9-NEXT: s_add_i32 s5, s5, 3
; GFX9-NEXT: s_add_i32 s4, s4, 3
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24
-; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
-; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24
-; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24
-; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24
-; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], 24
+; GFX9-NEXT: s_lshr_b64 s[22:23], s[6:7], 24
+; GFX9-NEXT: s_lshr_b64 s[24:25], s[8:9], 24
+; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24
+; GFX9-NEXT: s_lshr_b64 s[28:29], s[12:13], 24
+; GFX9-NEXT: s_lshr_b64 s[40:41], s[14:15], 24
+; GFX9-NEXT: s_lshr_b64 s[42:43], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[44:45], s[18:19], 24
; GFX9-NEXT: s_lshr_b32 s56, s5, 24
; GFX9-NEXT: s_lshr_b32 s57, s5, 16
; GFX9-NEXT: s_lshr_b32 s58, s5, 8
; GFX9-NEXT: s_lshr_b32 s59, s4, 16
; GFX9-NEXT: s_lshr_b32 s60, s4, 8
-; GFX9-NEXT: s_lshr_b32 s61, s29, 24
-; GFX9-NEXT: s_lshr_b32 s62, s29, 16
-; GFX9-NEXT: s_lshr_b32 s63, s29, 8
-; GFX9-NEXT: s_lshr_b32 s72, s28, 16
-; GFX9-NEXT: s_lshr_b32 s73, s28, 8
-; GFX9-NEXT: s_lshr_b32 s74, s27, 24
-; GFX9-NEXT: s_lshr_b32 s75, s27, 16
-; GFX9-NEXT: s_lshr_b32 s76, s27, 8
-; GFX9-NEXT: s_lshr_b32 s77, s26, 16
-; GFX9-NEXT: s_lshr_b32 s78, s26, 8
-; GFX9-NEXT: s_lshr_b32 s79, s25, 24
-; GFX9-NEXT: s_lshr_b32 s88, s25, 16
-; GFX9-NEXT: s_lshr_b32 s89, s25, 8
-; GFX9-NEXT: s_lshr_b32 s90, s24, 16
-; GFX9-NEXT: s_lshr_b32 s91, s24, 8
-; GFX9-NEXT: s_lshr_b32 s92, s23, 24
-; GFX9-NEXT: s_lshr_b32 s93, s23, 16
-; GFX9-NEXT: s_lshr_b32 s94, s23, 8
-; GFX9-NEXT: s_lshr_b32 s95, s22, 16
-; GFX9-NEXT: s_lshr_b32 s30, s22, 8
-; GFX9-NEXT: s_lshr_b32 s31, s21, 24
-; GFX9-NEXT: s_lshr_b32 s34, s21, 16
-; GFX9-NEXT: s_lshr_b32 s35, s21, 8
-; GFX9-NEXT: s_lshr_b32 s36, s20, 16
-; GFX9-NEXT: s_lshr_b32 s37, s20, 8
-; GFX9-NEXT: s_lshr_b32 s38, s19, 24
-; GFX9-NEXT: s_lshr_b32 s39, s19, 16
-; GFX9-NEXT: s_lshr_b32 s48, s19, 8
-; GFX9-NEXT: s_lshr_b32 s49, s18, 16
-; GFX9-NEXT: s_lshr_b32 s50, s18, 8
-; GFX9-NEXT: s_lshr_b32 s51, s17, 24
-; GFX9-NEXT: s_lshr_b32 s52, s17, 16
-; GFX9-NEXT: s_lshr_b32 s53, s17, 8
-; GFX9-NEXT: s_lshr_b32 s54, s16, 16
-; GFX9-NEXT: s_lshr_b32 s55, s16, 8
+; GFX9-NEXT: s_lshr_b32 s61, s7, 24
+; GFX9-NEXT: s_lshr_b32 s62, s7, 16
+; GFX9-NEXT: s_lshr_b32 s63, s7, 8
+; GFX9-NEXT: s_lshr_b32 s72, s6, 16
+; GFX9-NEXT: s_lshr_b32 s73, s6, 8
+; GFX9-NEXT: s_lshr_b32 s74, s9, 24
+; GFX9-NEXT: s_lshr_b32 s75, s9, 16
+; GFX9-NEXT: s_lshr_b32 s76, s9, 8
+; GFX9-NEXT: s_lshr_b32 s77, s8, 16
+; GFX9-NEXT: s_lshr_b32 s78, s8, 8
+; GFX9-NEXT: s_lshr_b32 s79, s11, 24
+; GFX9-NEXT: s_lshr_b32 s88, s11, 16
+; GFX9-NEXT: s_lshr_b32 s89, s11, 8
+; GFX9-NEXT: s_lshr_b32 s90, s10, 16
+; GFX9-NEXT: s_lshr_b32 s91, s10, 8
+; GFX9-NEXT: s_lshr_b32 s92, s13, 24
+; GFX9-NEXT: s_lshr_b32 s93, s13, 16
+; GFX9-NEXT: s_lshr_b32 s94, s13, 8
+; GFX9-NEXT: s_lshr_b32 s95, s12, 16
+; GFX9-NEXT: s_lshr_b32 s30, s12, 8
+; GFX9-NEXT: s_lshr_b32 s31, s15, 24
+; GFX9-NEXT: s_lshr_b32 s34, s15, 16
+; GFX9-NEXT: s_lshr_b32 s35, s15, 8
+; GFX9-NEXT: s_lshr_b32 s36, s14, 16
+; GFX9-NEXT: s_lshr_b32 s37, s14, 8
+; GFX9-NEXT: s_lshr_b32 s38, s17, 24
+; GFX9-NEXT: s_lshr_b32 s39, s17, 16
+; GFX9-NEXT: s_lshr_b32 s48, s17, 8
+; GFX9-NEXT: s_lshr_b32 s49, s16, 16
+; GFX9-NEXT: s_lshr_b32 s50, s16, 8
+; GFX9-NEXT: s_lshr_b32 s51, s19, 24
+; GFX9-NEXT: s_lshr_b32 s52, s19, 16
+; GFX9-NEXT: s_lshr_b32 s53, s19, 8
+; GFX9-NEXT: s_lshr_b32 s54, s18, 16
+; GFX9-NEXT: s_lshr_b32 s55, s18, 8
; GFX9-NEXT: .LBB25_3: ; %end
-; GFX9-NEXT: s_and_b32 s7, s16, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s55, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s54, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s44, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s17, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s53, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s52, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s51, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_and_b32 s18, s18, 0xff
+; GFX9-NEXT: s_lshl_b32 s21, s55, 8
+; GFX9-NEXT: s_or_b32 s18, s18, s21
+; GFX9-NEXT: s_and_b32 s21, s54, 0xff
+; GFX9-NEXT: s_lshl_b32 s23, s44, 8
+; GFX9-NEXT: s_or_b32 s21, s21, s23
+; GFX9-NEXT: s_and_b32 s18, s18, 0xffff
+; GFX9-NEXT: s_lshl_b32 s21, s21, 16
+; GFX9-NEXT: s_or_b32 s18, s18, s21
+; GFX9-NEXT: v_mov_b32_e32 v1, s18
+; GFX9-NEXT: s_and_b32 s18, s19, 0xff
+; GFX9-NEXT: s_lshl_b32 s19, s53, 8
+; GFX9-NEXT: s_or_b32 s18, s18, s19
+; GFX9-NEXT: s_and_b32 s19, s52, 0xff
+; GFX9-NEXT: s_lshl_b32 s21, s51, 8
+; GFX9-NEXT: s_or_b32 s19, s19, s21
+; GFX9-NEXT: s_and_b32 s18, s18, 0xffff
+; GFX9-NEXT: s_lshl_b32 s19, s19, 16
+; GFX9-NEXT: s_or_b32 s18, s18, s19
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s18, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s50, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s49, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s42, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s18
+; GFX9-NEXT: s_and_b32 s16, s16, 0xff
+; GFX9-NEXT: s_lshl_b32 s18, s50, 8
+; GFX9-NEXT: s_or_b32 s16, s16, s18
+; GFX9-NEXT: s_and_b32 s18, s49, 0xff
+; GFX9-NEXT: s_lshl_b32 s19, s42, 8
+; GFX9-NEXT: s_or_b32 s18, s18, s19
+; GFX9-NEXT: s_and_b32 s16, s16, 0xffff
+; GFX9-NEXT: s_lshl_b32 s18, s18, 16
+; GFX9-NEXT: s_or_b32 s16, s16, s18
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s19, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s48, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s39, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s38, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-NEXT: s_and_b32 s16, s17, 0xff
+; GFX9-NEXT: s_lshl_b32 s17, s48, 8
+; GFX9-NEXT: s_or_b32 s16, s16, s17
+; GFX9-NEXT: s_and_b32 s17, s39, 0xff
+; GFX9-NEXT: s_lshl_b32 s18, s38, 8
+; GFX9-NEXT: s_or_b32 s17, s17, s18
+; GFX9-NEXT: s_and_b32 s16, s16, 0xffff
+; GFX9-NEXT: s_lshl_b32 s17, s17, 16
+; GFX9-NEXT: s_or_b32 s16, s16, s17
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s20, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s37, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s36, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s40, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-NEXT: s_and_b32 s14, s14, 0xff
+; GFX9-NEXT: s_lshl_b32 s16, s37, 8
+; GFX9-NEXT: s_or_b32 s14, s14, s16
+; GFX9-NEXT: s_and_b32 s16, s36, 0xff
+; GFX9-NEXT: s_lshl_b32 s17, s40, 8
+; GFX9-NEXT: s_or_b32 s16, s16, s17
+; GFX9-NEXT: s_and_b32 s14, s14, 0xffff
+; GFX9-NEXT: s_lshl_b32 s16, s16, 16
+; GFX9-NEXT: s_or_b32 s14, s14, s16
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s21, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s35, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s34, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s31, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-NEXT: s_and_b32 s14, s15, 0xff
+; GFX9-NEXT: s_lshl_b32 s15, s35, 8
+; GFX9-NEXT: s_or_b32 s14, s14, s15
+; GFX9-NEXT: s_and_b32 s15, s34, 0xff
+; GFX9-NEXT: s_lshl_b32 s16, s31, 8
+; GFX9-NEXT: s_or_b32 s15, s15, s16
+; GFX9-NEXT: s_and_b32 s14, s14, 0xffff
+; GFX9-NEXT: s_lshl_b32 s15, s15, 16
+; GFX9-NEXT: s_or_b32 s14, s14, s15
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s22, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s30, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s95, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s14, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-NEXT: s_and_b32 s12, s12, 0xff
+; GFX9-NEXT: s_lshl_b32 s14, s30, 8
+; GFX9-NEXT: s_or_b32 s12, s12, s14
+; GFX9-NEXT: s_and_b32 s14, s95, 0xff
+; GFX9-NEXT: s_lshl_b32 s15, s28, 8
+; GFX9-NEXT: s_or_b32 s14, s14, s15
+; GFX9-NEXT: s_and_b32 s12, s12, 0xffff
+; GFX9-NEXT: s_lshl_b32 s14, s14, 16
+; GFX9-NEXT: s_or_b32 s12, s12, s14
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s23, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s94, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s93, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s92, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-NEXT: s_and_b32 s12, s13, 0xff
+; GFX9-NEXT: s_lshl_b32 s13, s94, 8
+; GFX9-NEXT: s_or_b32 s12, s12, s13
+; GFX9-NEXT: s_and_b32 s13, s93, 0xff
+; GFX9-NEXT: s_lshl_b32 s14, s92, 8
+; GFX9-NEXT: s_or_b32 s13, s13, s14
+; GFX9-NEXT: s_and_b32 s12, s12, 0xffff
+; GFX9-NEXT: s_lshl_b32 s13, s13, 16
+; GFX9-NEXT: s_or_b32 s12, s12, s13
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s24, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s91, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s90, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s12, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-NEXT: s_and_b32 s10, s10, 0xff
+; GFX9-NEXT: s_lshl_b32 s12, s91, 8
+; GFX9-NEXT: s_or_b32 s10, s10, s12
+; GFX9-NEXT: s_and_b32 s12, s90, 0xff
+; GFX9-NEXT: s_lshl_b32 s13, s26, 8
+; GFX9-NEXT: s_or_b32 s12, s12, s13
+; GFX9-NEXT: s_and_b32 s10, s10, 0xffff
+; GFX9-NEXT: s_lshl_b32 s12, s12, 16
+; GFX9-NEXT: s_or_b32 s10, s10, s12
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s25, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s89, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s88, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s79, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-NEXT: s_and_b32 s10, s11, 0xff
+; GFX9-NEXT: s_lshl_b32 s11, s89, 8
+; GFX9-NEXT: s_or_b32 s10, s10, s11
+; GFX9-NEXT: s_and_b32 s11, s88, 0xff
+; GFX9-NEXT: s_lshl_b32 s12, s79, 8
+; GFX9-NEXT: s_or_b32 s11, s11, s12
+; GFX9-NEXT: s_and_b32 s10, s10, 0xffff
+; GFX9-NEXT: s_lshl_b32 s11, s11, 16
+; GFX9-NEXT: s_or_b32 s10, s10, s11
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s26, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s78, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s77, 0xff
-; GFX9-NEXT: s_lshl_b32 s10, s10, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s10
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-NEXT: s_and_b32 s8, s8, 0xff
+; GFX9-NEXT: s_lshl_b32 s10, s78, 8
+; GFX9-NEXT: s_or_b32 s8, s8, s10
+; GFX9-NEXT: s_and_b32 s10, s77, 0xff
+; GFX9-NEXT: s_lshl_b32 s11, s24, 8
+; GFX9-NEXT: s_or_b32 s10, s10, s11
+; GFX9-NEXT: s_and_b32 s8, s8, 0xffff
+; GFX9-NEXT: s_lshl_b32 s10, s10, 16
+; GFX9-NEXT: s_or_b32 s8, s8, s10
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s27, 0xff
+; GFX9-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-NEXT: s_and_b32 s8, s9, 0xff
; GFX9-NEXT: s_lshl_b32 s9, s76, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_or_b32 s8, s8, s9
; GFX9-NEXT: s_and_b32 s9, s75, 0xff
; GFX9-NEXT: s_lshl_b32 s10, s74, 8
; GFX9-NEXT: s_or_b32 s9, s9, s10
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT: s_and_b32 s8, s8, 0xffff
; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_or_b32 s8, s8, s9
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s28, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s73, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s72, 0xff
-; GFX9-NEXT: s_lshl_b32 s8, s8, 8
-; GFX9-NEXT: s_or_b32 s8, s9, s8
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s8, s8, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s8
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s29, 0xff
-; GFX9-NEXT: s_lshl_b32 s8, s63, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s8
-; GFX9-NEXT: s_and_b32 s8, s62, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s61, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-NEXT: s_and_b32 s6, s6, 0xff
+; GFX9-NEXT: s_lshl_b32 s8, s73, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s8
+; GFX9-NEXT: s_and_b32 s8, s72, 0xff
+; GFX9-NEXT: s_lshl_b32 s9, s22, 8
; GFX9-NEXT: s_or_b32 s8, s8, s9
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s8, s8, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s8
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s6, s7, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s63, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s62, 0xff
+; GFX9-NEXT: s_lshl_b32 s8, s61, 8
; GFX9-NEXT: s_or_b32 s7, s7, s8
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: s_and_b32 s4, s4, 0xff
-; GFX9-NEXT: s_lshl_b32 s7, s60, 8
-; GFX9-NEXT: s_or_b32 s4, s4, s7
-; GFX9-NEXT: s_and_b32 s7, s59, 0xff
-; GFX9-NEXT: s_lshl_b32 s6, s6, 8
-; GFX9-NEXT: s_or_b32 s6, s7, s6
+; GFX9-NEXT: s_lshl_b32 s6, s60, 8
+; GFX9-NEXT: s_or_b32 s4, s4, s6
+; GFX9-NEXT: s_and_b32 s6, s59, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s20, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
; GFX9-NEXT: s_lshl_b32 s6, s6, 16
; GFX9-NEXT: s_or_b32 s4, s4, s6
@@ -10881,24 +11062,24 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: v_readlane_b32 s55, v4, 15
-; GFX9-NEXT: v_readlane_b32 s54, v4, 14
-; GFX9-NEXT: v_readlane_b32 s53, v4, 13
-; GFX9-NEXT: v_readlane_b32 s52, v4, 12
-; GFX9-NEXT: v_readlane_b32 s51, v4, 11
-; GFX9-NEXT: v_readlane_b32 s50, v4, 10
-; GFX9-NEXT: v_readlane_b32 s49, v4, 9
-; GFX9-NEXT: v_readlane_b32 s48, v4, 8
-; GFX9-NEXT: v_readlane_b32 s39, v4, 7
-; GFX9-NEXT: v_readlane_b32 s38, v4, 6
-; GFX9-NEXT: v_readlane_b32 s37, v4, 5
-; GFX9-NEXT: v_readlane_b32 s36, v4, 4
-; GFX9-NEXT: v_readlane_b32 s35, v4, 3
-; GFX9-NEXT: v_readlane_b32 s34, v4, 2
-; GFX9-NEXT: v_readlane_b32 s31, v4, 1
-; GFX9-NEXT: v_readlane_b32 s30, v4, 0
+; GFX9-NEXT: v_readlane_b32 s55, v18, 15
+; GFX9-NEXT: v_readlane_b32 s54, v18, 14
+; GFX9-NEXT: v_readlane_b32 s53, v18, 13
+; GFX9-NEXT: v_readlane_b32 s52, v18, 12
+; GFX9-NEXT: v_readlane_b32 s51, v18, 11
+; GFX9-NEXT: v_readlane_b32 s50, v18, 10
+; GFX9-NEXT: v_readlane_b32 s49, v18, 9
+; GFX9-NEXT: v_readlane_b32 s48, v18, 8
+; GFX9-NEXT: v_readlane_b32 s39, v18, 7
+; GFX9-NEXT: v_readlane_b32 s38, v18, 6
+; GFX9-NEXT: v_readlane_b32 s37, v18, 5
+; GFX9-NEXT: v_readlane_b32 s36, v18, 4
+; GFX9-NEXT: v_readlane_b32 s35, v18, 3
+; GFX9-NEXT: v_readlane_b32 s34, v18, 2
+; GFX9-NEXT: v_readlane_b32 s31, v18, 1
+; GFX9-NEXT: v_readlane_b32 s30, v18, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10923,31 +11104,31 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; GFX9-NEXT: ; implicit-def: $sgpr31
; GFX9-NEXT: ; implicit-def: $sgpr30
; GFX9-NEXT: ; implicit-def: $sgpr95
-; GFX9-NEXT: ; implicit-def: $sgpr14
+; GFX9-NEXT: ; implicit-def: $sgpr28
; GFX9-NEXT: ; implicit-def: $sgpr94
; GFX9-NEXT: ; implicit-def: $sgpr93
; GFX9-NEXT: ; implicit-def: $sgpr92
; GFX9-NEXT: ; implicit-def: $sgpr91
; GFX9-NEXT: ; implicit-def: $sgpr90
-; GFX9-NEXT: ; implicit-def: $sgpr12
+; GFX9-NEXT: ; implicit-def: $sgpr26
; GFX9-NEXT: ; implicit-def: $sgpr89
; GFX9-NEXT: ; implicit-def: $sgpr88
; GFX9-NEXT: ; implicit-def: $sgpr79
; GFX9-NEXT: ; implicit-def: $sgpr78
; GFX9-NEXT: ; implicit-def: $sgpr77
-; GFX9-NEXT: ; implicit-def: $sgpr10
+; GFX9-NEXT: ; implicit-def: $sgpr24
; GFX9-NEXT: ; implicit-def: $sgpr76
; GFX9-NEXT: ; implicit-def: $sgpr75
; GFX9-NEXT: ; implicit-def: $sgpr74
; GFX9-NEXT: ; implicit-def: $sgpr73
; GFX9-NEXT: ; implicit-def: $sgpr72
-; GFX9-NEXT: ; implicit-def: $sgpr8
+; GFX9-NEXT: ; implicit-def: $sgpr22
; GFX9-NEXT: ; implicit-def: $sgpr63
; GFX9-NEXT: ; implicit-def: $sgpr62
; GFX9-NEXT: ; implicit-def: $sgpr61
; GFX9-NEXT: ; implicit-def: $sgpr60
; GFX9-NEXT: ; implicit-def: $sgpr59
-; GFX9-NEXT: ; implicit-def: $sgpr6
+; GFX9-NEXT: ; implicit-def: $sgpr20
; GFX9-NEXT: ; implicit-def: $sgpr58
; GFX9-NEXT: ; implicit-def: $sgpr57
; GFX9-NEXT: ; implicit-def: $sgpr56
@@ -17793,111 +17974,139 @@ define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a,
; VI-LABEL: bitcast_v32i16_to_v16f32_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, s16
+; VI-NEXT: v_mov_b32_e32 v4, s17
+; VI-NEXT: v_mov_b32_e32 v5, s18
+; VI-NEXT: v_mov_b32_e32 v6, s19
+; VI-NEXT: v_mov_b32_e32 v7, s20
+; VI-NEXT: v_mov_b32_e32 v8, s21
+; VI-NEXT: v_mov_b32_e32 v9, s22
+; VI-NEXT: v_mov_b32_e32 v10, s23
+; VI-NEXT: v_mov_b32_e32 v11, s24
+; VI-NEXT: v_mov_b32_e32 v12, s25
+; VI-NEXT: v_mov_b32_e32 v13, s26
+; VI-NEXT: v_mov_b32_e32 v14, s27
+; VI-NEXT: v_mov_b32_e32 v15, s28
+; VI-NEXT: v_mov_b32_e32 v16, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: v_readfirstlane_b32 s6, v0
+; VI-NEXT: v_readfirstlane_b32 s6, v3
+; VI-NEXT: v_readfirstlane_b32 s7, v4
+; VI-NEXT: v_readfirstlane_b32 s8, v5
+; VI-NEXT: v_readfirstlane_b32 s9, v6
+; VI-NEXT: v_readfirstlane_b32 s10, v7
+; VI-NEXT: v_readfirstlane_b32 s11, v8
+; VI-NEXT: v_readfirstlane_b32 s12, v9
+; VI-NEXT: v_readfirstlane_b32 s13, v10
+; VI-NEXT: v_readfirstlane_b32 s14, v11
+; VI-NEXT: v_readfirstlane_b32 s15, v12
+; VI-NEXT: v_readfirstlane_b32 s16, v13
+; VI-NEXT: v_readfirstlane_b32 s17, v14
+; VI-NEXT: v_readfirstlane_b32 s18, v15
+; VI-NEXT: v_readfirstlane_b32 s19, v16
+; VI-NEXT: v_readfirstlane_b32 s20, v0
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_readfirstlane_b32 s7, v1
+; VI-NEXT: v_readfirstlane_b32 s21, v1
; VI-NEXT: s_cbranch_scc0 .LBB39_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB39_3
; VI-NEXT: .LBB39_2: ; %cmp.true
-; VI-NEXT: s_add_i32 s5, s7, 3
-; VI-NEXT: s_and_b32 s4, s7, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s21, 3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s6, 3
-; VI-NEXT: s_add_i32 s7, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s6, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s20, 3
+; VI-NEXT: s_add_i32 s21, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s29, 3
-; VI-NEXT: s_add_i32 s6, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s19, 3
+; VI-NEXT: s_add_i32 s20, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s28, 3
-; VI-NEXT: s_add_i32 s29, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s18, 3
+; VI-NEXT: s_add_i32 s19, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s27, 3
-; VI-NEXT: s_add_i32 s28, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s17, 3
+; VI-NEXT: s_add_i32 s18, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s26, 3
-; VI-NEXT: s_add_i32 s27, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s16, 3
+; VI-NEXT: s_add_i32 s17, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s25, 3
-; VI-NEXT: s_add_i32 s26, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s15, 3
+; VI-NEXT: s_add_i32 s16, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s15, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s24, 3
-; VI-NEXT: s_add_i32 s25, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s14, 3
+; VI-NEXT: s_add_i32 s15, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s14, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s23, 3
-; VI-NEXT: s_add_i32 s24, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s13, 3
+; VI-NEXT: s_add_i32 s14, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s13, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s22, 3
-; VI-NEXT: s_add_i32 s23, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s12, 3
+; VI-NEXT: s_add_i32 s13, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s12, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s21, 3
-; VI-NEXT: s_add_i32 s22, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s11, 3
+; VI-NEXT: s_add_i32 s12, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s11, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s20, 3
-; VI-NEXT: s_add_i32 s21, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s10, 3
+; VI-NEXT: s_add_i32 s11, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s10, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s19, 3
-; VI-NEXT: s_add_i32 s20, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s9, 3
+; VI-NEXT: s_add_i32 s10, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s9, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s18, 3
-; VI-NEXT: s_add_i32 s19, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s8, 3
+; VI-NEXT: s_add_i32 s9, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s8, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s17, 3
-; VI-NEXT: s_add_i32 s18, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s7, 3
+; VI-NEXT: s_add_i32 s8, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s7, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s16, 3
-; VI-NEXT: s_add_i32 s17, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s6, 3
+; VI-NEXT: s_add_i32 s7, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s6, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s16, s4, 0x30000
+; VI-NEXT: s_add_i32 s6, s4, 0x30000
; VI-NEXT: .LBB39_3: ; %end
-; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v1, s17
-; VI-NEXT: v_mov_b32_e32 v2, s18
-; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: v_mov_b32_e32 v4, s20
-; VI-NEXT: v_mov_b32_e32 v5, s21
-; VI-NEXT: v_mov_b32_e32 v6, s22
-; VI-NEXT: v_mov_b32_e32 v7, s23
-; VI-NEXT: v_mov_b32_e32 v8, s24
-; VI-NEXT: v_mov_b32_e32 v9, s25
-; VI-NEXT: v_mov_b32_e32 v10, s26
-; VI-NEXT: v_mov_b32_e32 v11, s27
-; VI-NEXT: v_mov_b32_e32 v12, s28
-; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: v_mov_b32_e32 v14, s6
-; VI-NEXT: v_mov_b32_e32 v15, s7
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: v_mov_b32_e32 v4, s10
+; VI-NEXT: v_mov_b32_e32 v5, s11
+; VI-NEXT: v_mov_b32_e32 v6, s12
+; VI-NEXT: v_mov_b32_e32 v7, s13
+; VI-NEXT: v_mov_b32_e32 v8, s14
+; VI-NEXT: v_mov_b32_e32 v9, s15
+; VI-NEXT: v_mov_b32_e32 v10, s16
+; VI-NEXT: v_mov_b32_e32 v11, s17
+; VI-NEXT: v_mov_b32_e32 v12, s18
+; VI-NEXT: v_mov_b32_e32 v13, s19
+; VI-NEXT: v_mov_b32_e32 v14, s20
+; VI-NEXT: v_mov_b32_e32 v15, s21
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB39_4:
; VI-NEXT: s_branch .LBB39_2
@@ -18307,79 +18516,91 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: v_readfirstlane_b32 s6, v0
+; SI-NEXT: v_mov_b32_e32 v55, s16
+; SI-NEXT: v_mov_b32_e32 v54, s17
+; SI-NEXT: v_mov_b32_e32 v53, s18
+; SI-NEXT: v_mov_b32_e32 v52, s19
+; SI-NEXT: v_mov_b32_e32 v51, s20
+; SI-NEXT: v_mov_b32_e32 v50, s21
+; SI-NEXT: v_mov_b32_e32 v49, s22
+; SI-NEXT: v_mov_b32_e32 v48, s23
+; SI-NEXT: v_mov_b32_e32 v39, s24
+; SI-NEXT: v_mov_b32_e32 v38, s25
+; SI-NEXT: v_mov_b32_e32 v36, s26
+; SI-NEXT: v_mov_b32_e32 v35, s27
+; SI-NEXT: v_mov_b32_e32 v34, s28
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_readfirstlane_b32 s7, v1
+; SI-NEXT: v_mov_b32_e32 v37, s29
; SI-NEXT: s_cbranch_scc0 .LBB41_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_lshr_b32 s4, s7, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v31, s4
-; SI-NEXT: s_lshr_b32 s4, s6, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v29, s4
-; SI-NEXT: s_lshr_b32 s4, s29, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v27, s4
-; SI-NEXT: s_lshr_b32 s4, s28, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v25, s4
-; SI-NEXT: s_lshr_b32 s4, s27, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v23, s4
-; SI-NEXT: s_lshr_b32 s4, s26, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v21, s4
-; SI-NEXT: s_lshr_b32 s4, s25, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v19, s4
-; SI-NEXT: s_lshr_b32 s4, s24, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v17, s4
-; SI-NEXT: s_lshr_b32 s4, s23, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v15, s4
-; SI-NEXT: s_lshr_b32 s4, s22, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v13, s4
-; SI-NEXT: s_lshr_b32 s4, s21, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v11, s4
-; SI-NEXT: s_lshr_b32 s4, s20, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v9, s4
-; SI-NEXT: s_lshr_b32 s4, s19, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
-; SI-NEXT: s_lshr_b32 s4, s18, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v5, s4
-; SI-NEXT: s_lshr_b32 s4, s17, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
-; SI-NEXT: s_lshr_b32 s4, s16, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT: v_cvt_f32_f16_e32 v30, s7
-; SI-NEXT: v_cvt_f32_f16_e32 v28, s6
-; SI-NEXT: v_cvt_f32_f16_e32 v26, s29
-; SI-NEXT: v_cvt_f32_f16_e32 v24, s28
-; SI-NEXT: v_cvt_f32_f16_e32 v22, s27
-; SI-NEXT: v_cvt_f32_f16_e32 v20, s26
-; SI-NEXT: v_cvt_f32_f16_e32 v18, s25
-; SI-NEXT: v_cvt_f32_f16_e32 v16, s24
-; SI-NEXT: v_cvt_f32_f16_e32 v14, s23
-; SI-NEXT: v_cvt_f32_f16_e32 v12, s22
-; SI-NEXT: v_cvt_f32_f16_e32 v10, s21
-; SI-NEXT: v_cvt_f32_f16_e32 v8, s20
-; SI-NEXT: v_cvt_f32_f16_e32 v6, s19
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s18
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s17
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v29, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v27, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55
+; SI-NEXT: v_cvt_f32_f16_e32 v33, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v18, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v49
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v50
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v52
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v53
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v54
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v55
; SI-NEXT: s_cbranch_execnz .LBB41_3
; SI-NEXT: .LBB41_2: ; %cmp.true
-; SI-NEXT: v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT: v_add_f32_e64 v2, s17, 1.0
-; SI-NEXT: v_add_f32_e64 v4, s18, 1.0
-; SI-NEXT: v_add_f32_e64 v6, s19, 1.0
-; SI-NEXT: v_add_f32_e64 v8, s20, 1.0
-; SI-NEXT: v_add_f32_e64 v10, s21, 1.0
-; SI-NEXT: v_add_f32_e64 v12, s22, 1.0
-; SI-NEXT: v_add_f32_e64 v14, s23, 1.0
-; SI-NEXT: v_add_f32_e64 v16, s24, 1.0
-; SI-NEXT: v_add_f32_e64 v18, s25, 1.0
-; SI-NEXT: v_add_f32_e64 v20, s26, 1.0
-; SI-NEXT: v_add_f32_e64 v22, s27, 1.0
-; SI-NEXT: v_add_f32_e64 v24, s28, 1.0
-; SI-NEXT: v_add_f32_e64 v26, s29, 1.0
-; SI-NEXT: v_add_f32_e64 v28, s6, 1.0
-; SI-NEXT: v_add_f32_e64 v30, s7, 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT: v_add_f32_e32 v3, 1.0, v55
+; SI-NEXT: v_add_f32_e32 v2, 1.0, v54
+; SI-NEXT: v_add_f32_e32 v4, 1.0, v53
+; SI-NEXT: v_add_f32_e32 v6, 1.0, v52
+; SI-NEXT: v_add_f32_e32 v8, 1.0, v51
+; SI-NEXT: v_add_f32_e32 v10, 1.0, v50
+; SI-NEXT: v_add_f32_e32 v12, 1.0, v49
+; SI-NEXT: v_add_f32_e32 v14, 1.0, v48
+; SI-NEXT: v_add_f32_e32 v16, 1.0, v39
+; SI-NEXT: v_add_f32_e32 v18, 1.0, v38
+; SI-NEXT: v_add_f32_e32 v20, 1.0, v36
+; SI-NEXT: v_add_f32_e32 v22, 1.0, v35
+; SI-NEXT: v_add_f32_e32 v24, 1.0, v34
+; SI-NEXT: v_add_f32_e32 v26, 1.0, v37
+; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_add_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8
@@ -18392,10 +18613,10 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a,
; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22
; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24
; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v0
; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
@@ -18409,7 +18630,7 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v3
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
@@ -18424,13 +18645,15 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v54
+; SI-NEXT: v_cvt_f32_f16_e32 v33, v33
; SI-NEXT: .LBB41_3: ; %end
+; SI-NEXT: v_mov_b32_e32 v0, v32
+; SI-NEXT: v_mov_b32_e32 v1, v33
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB41_4:
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr4
@@ -21380,172 +21603,209 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mul_f32_e64 v62, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v45, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v44, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v42, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17
+; SI-NEXT: v_mul_f32_e64 v41, 1.0, s21
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e64 v63, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29
; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9
; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13
; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15
; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v16
+; SI-NEXT: v_mul_f32_e64 v39, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v37, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s28
; SI-NEXT: s_cbranch_scc0 .LBB47_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17
-; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16
-; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16
-; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16
-; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16
-; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16
-; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16
-; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v62
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60
+; SI-NEXT: v_lshr_b64 v[0:1], v[39:40], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16
+; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v41
+; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63
+; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61
+; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v59
+; SI-NEXT: v_lshr_b64 v[5:6], v[37:38], 16
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58
+; SI-NEXT: v_lshr_b64 v[6:7], v[35:36], 16
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v57
+; SI-NEXT: v_lshr_b64 v[7:8], v[33:34], 16
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v56
+; SI-NEXT: v_lshr_b64 v[8:9], v[31:32], 16
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v47
+; SI-NEXT: v_lshr_b64 v[9:10], v[29:30], 16
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v46
+; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v45
+; SI-NEXT: v_lshr_b64 v[11:12], v[25:26], 16
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v44
+; SI-NEXT: v_lshr_b64 v[12:13], v[23:24], 16
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43
+; SI-NEXT: v_lshr_b64 v[13:14], v[21:22], 16
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42
+; SI-NEXT: v_lshr_b64 v[14:15], v[19:20], 16
+; SI-NEXT: v_mov_b32_e32 v20, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16
+; SI-NEXT: v_mov_b32_e32 v18, v20
; SI-NEXT: s_cbranch_execnz .LBB47_3
; SI-NEXT: .LBB47_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v63
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v56
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v47
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
+; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
+; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v43
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
+; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v42
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
+; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16
; SI-NEXT: .LBB47_3: ; %end
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB47_4:
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -21554,687 +21814,665 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-LABEL: bitcast_v32bf16_to_v16f32_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v19, s30, 0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: v_writelane_b32 v19, s31, 1
-; VI-NEXT: v_readfirstlane_b32 s30, v0
+; VI-NEXT: v_mov_b32_e32 v10, v2
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; VI-NEXT: v_mov_b32_e32 v15, v1
+; VI-NEXT: v_mov_b32_e32 v14, v0
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v1, s17
+; VI-NEXT: v_mov_b32_e32 v2, s18
+; VI-NEXT: v_mov_b32_e32 v3, s19
+; VI-NEXT: v_mov_b32_e32 v4, s20
+; VI-NEXT: v_mov_b32_e32 v5, s21
+; VI-NEXT: v_mov_b32_e32 v6, s22
+; VI-NEXT: v_mov_b32_e32 v7, s23
+; VI-NEXT: v_mov_b32_e32 v8, s24
+; VI-NEXT: v_mov_b32_e32 v9, s25
+; VI-NEXT: v_mov_b32_e32 v11, s27
+; VI-NEXT: v_mov_b32_e32 v13, s29
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_readfirstlane_b32 s31, v1
-; VI-NEXT: s_cbranch_scc0 .LBB47_3
+; VI-NEXT: v_mov_b32_e32 v10, s26
+; VI-NEXT: v_mov_b32_e32 v12, s28
+; VI-NEXT: s_cbranch_scc0 .LBB47_4
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB47_4
+; VI-NEXT: s_cbranch_execnz .LBB47_3
; VI-NEXT: .LBB47_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s31, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_lshl_b32 s4, s30, 16
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v0
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s4, v0
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17]
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_bfe_u32 v15, v14, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v14
+; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[17:18]
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_bfe_u32 v13, v12, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12
+; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[18:19]
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_bfe_u32 v11, v10, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v10
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[19:20]
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20]
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v8
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[20:21]
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21]
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_bfe_u32 v7, v6, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[21:22]
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; VI-NEXT: v_lshrrev_b64 v[21:22], 16, v[21:22]
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16
-; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v4
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[22:23]
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23]
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v2
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[23:24]
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v16, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v16, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1
+; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24]
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
-; VI-NEXT: v_add_f32_e32 v16, s4, v0
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16
-; VI-NEXT: v_add_f32_e32 v16, s4, v0
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_bfe_u32 v17, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v24, v3, v5, vcc
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16
-; VI-NEXT: s_branch .LBB47_5
-; VI-NEXT: .LBB47_3:
-; VI-NEXT: s_branch .LBB47_2
-; VI-NEXT: .LBB47_4:
-; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v1, s17
-; VI-NEXT: v_mov_b32_e32 v2, s18
-; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: v_mov_b32_e32 v4, s20
-; VI-NEXT: v_mov_b32_e32 v5, s21
-; VI-NEXT: v_mov_b32_e32 v6, s22
-; VI-NEXT: v_mov_b32_e32 v7, s23
-; VI-NEXT: v_mov_b32_e32 v8, s24
-; VI-NEXT: v_mov_b32_e32 v9, s25
-; VI-NEXT: v_mov_b32_e32 v10, s26
-; VI-NEXT: v_mov_b32_e32 v11, s27
-; VI-NEXT: v_mov_b32_e32 v12, s28
-; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: v_mov_b32_e32 v14, s30
-; VI-NEXT: v_mov_b32_e32 v15, s31
-; VI-NEXT: .LBB47_5: ; %end
-; VI-NEXT: v_readlane_b32 s31, v19, 1
-; VI-NEXT: v_readlane_b32 s30, v19, 0
-; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[24:25]
+; VI-NEXT: v_mov_b32_e32 v1, v23
+; VI-NEXT: v_mov_b32_e32 v3, v22
+; VI-NEXT: v_mov_b32_e32 v5, v21
+; VI-NEXT: v_mov_b32_e32 v7, v20
+; VI-NEXT: v_mov_b32_e32 v9, v19
+; VI-NEXT: v_mov_b32_e32 v11, v18
+; VI-NEXT: v_mov_b32_e32 v13, v17
+; VI-NEXT: v_mov_b32_e32 v15, v16
+; VI-NEXT: .LBB47_3: ; %end
; VI-NEXT: s_setpc_b64 s[30:31]
+; VI-NEXT: .LBB47_4:
+; VI-NEXT: s_branch .LBB47_2
;
; GFX9-LABEL: bitcast_v32bf16_to_v16f32_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v20, s30, 0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT: v_writelane_b32 v20, s31, 1
-; GFX9-NEXT: v_readfirstlane_b32 s30, v0
+; GFX9-NEXT: v_mov_b32_e32 v13, v2
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; GFX9-NEXT: v_mov_b32_e32 v15, v1
+; GFX9-NEXT: v_mov_b32_e32 v14, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s16
+; GFX9-NEXT: v_mov_b32_e32 v1, s17
+; GFX9-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-NEXT: v_mov_b32_e32 v3, s19
+; GFX9-NEXT: v_mov_b32_e32 v4, s20
+; GFX9-NEXT: v_mov_b32_e32 v5, s21
+; GFX9-NEXT: v_mov_b32_e32 v6, s22
+; GFX9-NEXT: v_mov_b32_e32 v7, s23
+; GFX9-NEXT: v_mov_b32_e32 v8, s24
+; GFX9-NEXT: v_mov_b32_e32 v9, s25
+; GFX9-NEXT: v_mov_b32_e32 v10, s26
+; GFX9-NEXT: v_mov_b32_e32 v11, s27
+; GFX9-NEXT: v_mov_b32_e32 v12, s28
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT: v_readfirstlane_b32 s31, v1
-; GFX9-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX9-NEXT: v_mov_b32_e32 v13, s29
+; GFX9-NEXT: s_cbranch_scc0 .LBB47_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB47_4
+; GFX9-NEXT: s_cbranch_execnz .LBB47_3
; GFX9-NEXT: .LBB47_2: ; %cmp.true
-; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s31, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add_f32_e32 v3, s4, v0
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
-; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: s_lshl_b32 s4, s30, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_add_f32_e32 v4, s4, v0
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v5, v5, v4
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v17, v17, v16
+; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v16, v16, v15
+; GFX9-NEXT: v_add_u32_e32 v16, 0x7fff, v16
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc
; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff
-; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX9-NEXT: v_and_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v15, v17, 16, v15
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v14, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v14
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v14
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v14, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v14, v17, 16, v14
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v13
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v13, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v13
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v13, v17, 16, v13
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v12
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v12
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v12
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v12, v17, 16, v12
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v11
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v11
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v11
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v11, v17, 16, v11
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v10
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v10, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v10
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v10
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v10, v17, 16, v10
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v9
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v9, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v9
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v9, v17, 16, v9
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v8
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v8, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v8
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v8, v17, 16, v8
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v7, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v7
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v7, v17, 16, v7
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v6
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v5, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v5
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v5
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v5, v17, 16, v5
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v4
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v4, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v4
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s29, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s28, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s27, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s26, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s25, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s24, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s23, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s22, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s21, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s20, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s19, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s18, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v17, v17, v2
-; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v4, v17, 16, v4
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v3, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v3
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v3
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v2
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v2
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v17, v17, v1
-; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s17, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc
-; GFX9-NEXT: v_add_f32_e32 v17, s4, v0
+; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17
-; GFX9-NEXT: v_add_f32_e32 v17, s4, v0
+; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v1
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
-; GFX9-NEXT: s_lshl_b32 s4, s16, 16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_add_f32_e32 v0, s4, v0
+; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1
; GFX9-NEXT: v_add_u32_e32 v18, v18, v0
; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16
-; GFX9-NEXT: s_branch .LBB47_5
-; GFX9-NEXT: .LBB47_3:
-; GFX9-NEXT: s_branch .LBB47_2
-; GFX9-NEXT: .LBB47_4:
-; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: v_mov_b32_e32 v1, s17
-; GFX9-NEXT: v_mov_b32_e32 v2, s18
-; GFX9-NEXT: v_mov_b32_e32 v3, s19
-; GFX9-NEXT: v_mov_b32_e32 v4, s20
-; GFX9-NEXT: v_mov_b32_e32 v5, s21
-; GFX9-NEXT: v_mov_b32_e32 v6, s22
-; GFX9-NEXT: v_mov_b32_e32 v7, s23
-; GFX9-NEXT: v_mov_b32_e32 v8, s24
-; GFX9-NEXT: v_mov_b32_e32 v9, s25
-; GFX9-NEXT: v_mov_b32_e32 v10, s26
-; GFX9-NEXT: v_mov_b32_e32 v11, s27
-; GFX9-NEXT: v_mov_b32_e32 v12, s28
-; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: v_mov_b32_e32 v14, s30
-; GFX9-NEXT: v_mov_b32_e32 v15, s31
-; GFX9-NEXT: .LBB47_5: ; %end
-; GFX9-NEXT: v_readlane_b32 s31, v20, 1
-; GFX9-NEXT: v_readlane_b32 s30, v20, 0
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v0, v17, 16, v0
+; GFX9-NEXT: .LBB47_3: ; %end
; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: .LBB47_4:
+; GFX9-NEXT: s_branch .LBB47_2
;
; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v16f32_scalar:
; GFX11-TRUE16: ; %bb.0:
@@ -23309,22 +23547,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v14
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v33
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -23360,6 +23582,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -23642,22 +23880,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18
@@ -23691,6 +23913,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -23962,25 +24200,9 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -24007,6 +24229,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -24604,92 +24842,120 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; SI-NEXT: v_writelane_b32 v40, s81, 25
; SI-NEXT: v_writelane_b32 v40, s82, 26
; SI-NEXT: v_writelane_b32 v40, s83, 27
+; SI-NEXT: v_mov_b32_e32 v4, s16
+; SI-NEXT: v_mov_b32_e32 v5, s17
+; SI-NEXT: v_mov_b32_e32 v6, s18
+; SI-NEXT: v_mov_b32_e32 v7, s19
+; SI-NEXT: v_mov_b32_e32 v8, s20
+; SI-NEXT: v_mov_b32_e32 v9, s21
+; SI-NEXT: v_mov_b32_e32 v10, s22
+; SI-NEXT: v_mov_b32_e32 v11, s23
+; SI-NEXT: v_mov_b32_e32 v12, s24
+; SI-NEXT: v_mov_b32_e32 v13, s25
+; SI-NEXT: v_mov_b32_e32 v14, s26
+; SI-NEXT: v_mov_b32_e32 v15, s27
+; SI-NEXT: v_mov_b32_e32 v16, s28
+; SI-NEXT: v_mov_b32_e32 v17, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; SI-NEXT: v_writelane_b32 v40, s84, 28
-; SI-NEXT: v_readfirstlane_b32 s36, v1
+; SI-NEXT: v_readfirstlane_b32 s36, v4
+; SI-NEXT: v_readfirstlane_b32 s37, v5
+; SI-NEXT: v_readfirstlane_b32 s34, v6
+; SI-NEXT: v_readfirstlane_b32 s35, v7
+; SI-NEXT: v_readfirstlane_b32 s30, v8
+; SI-NEXT: v_readfirstlane_b32 s31, v9
+; SI-NEXT: v_readfirstlane_b32 s94, v10
+; SI-NEXT: v_readfirstlane_b32 s95, v11
+; SI-NEXT: v_readfirstlane_b32 s92, v12
+; SI-NEXT: v_readfirstlane_b32 s93, v13
+; SI-NEXT: v_readfirstlane_b32 s90, v14
+; SI-NEXT: v_readfirstlane_b32 s91, v15
+; SI-NEXT: v_readfirstlane_b32 s88, v16
+; SI-NEXT: v_readfirstlane_b32 s89, v17
+; SI-NEXT: v_readfirstlane_b32 s78, v1
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_readfirstlane_b32 s37, v2
+; SI-NEXT: v_readfirstlane_b32 s79, v2
; SI-NEXT: v_writelane_b32 v40, s85, 29
; SI-NEXT: s_cbranch_scc0 .LBB49_3
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_lshr_b32 s82, s37, 24
-; SI-NEXT: s_lshr_b32 s84, s37, 16
-; SI-NEXT: s_lshr_b32 s85, s37, 8
-; SI-NEXT: s_lshr_b32 s71, s29, 24
-; SI-NEXT: s_lshr_b32 s81, s29, 16
-; SI-NEXT: s_lshr_b32 s83, s29, 8
-; SI-NEXT: s_lshr_b32 s68, s27, 24
-; SI-NEXT: s_lshr_b32 s70, s27, 16
-; SI-NEXT: s_lshr_b32 s80, s27, 8
-; SI-NEXT: s_lshr_b32 s65, s25, 24
-; SI-NEXT: s_lshr_b32 s67, s25, 16
-; SI-NEXT: s_lshr_b32 s69, s25, 8
-; SI-NEXT: s_lshr_b32 s54, s23, 24
-; SI-NEXT: s_lshr_b32 s64, s23, 16
-; SI-NEXT: s_lshr_b32 s66, s23, 8
-; SI-NEXT: s_lshr_b32 s51, s21, 24
-; SI-NEXT: s_lshr_b32 s53, s21, 16
-; SI-NEXT: s_lshr_b32 s55, s21, 8
-; SI-NEXT: s_lshr_b32 s48, s19, 24
-; SI-NEXT: s_lshr_b32 s50, s19, 16
-; SI-NEXT: s_lshr_b32 s52, s19, 8
-; SI-NEXT: s_lshr_b32 s38, s17, 24
-; SI-NEXT: s_lshr_b32 s39, s17, 16
-; SI-NEXT: s_lshr_b32 s49, s17, 8
-; SI-NEXT: s_lshr_b64 s[4:5], s[36:37], 24
-; SI-NEXT: s_lshr_b64 s[6:7], s[36:37], 16
-; SI-NEXT: s_lshr_b64 s[8:9], s[36:37], 8
-; SI-NEXT: s_lshr_b64 s[10:11], s[28:29], 24
-; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 16
-; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 8
-; SI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24
-; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 16
-; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 8
-; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 24
-; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 16
-; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 8
-; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24
-; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16
-; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 8
-; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 24
-; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16
-; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 8
-; SI-NEXT: s_lshr_b64 s[88:89], s[18:19], 24
-; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 16
-; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 8
-; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 24
-; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 16
-; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 8
+; SI-NEXT: s_lshr_b32 s82, s79, 24
+; SI-NEXT: s_lshr_b32 s84, s79, 16
+; SI-NEXT: s_lshr_b32 s85, s79, 8
+; SI-NEXT: s_lshr_b32 s71, s89, 24
+; SI-NEXT: s_lshr_b32 s81, s89, 16
+; SI-NEXT: s_lshr_b32 s83, s89, 8
+; SI-NEXT: s_lshr_b32 s68, s91, 24
+; SI-NEXT: s_lshr_b32 s70, s91, 16
+; SI-NEXT: s_lshr_b32 s80, s91, 8
+; SI-NEXT: s_lshr_b32 s65, s93, 24
+; SI-NEXT: s_lshr_b32 s67, s93, 16
+; SI-NEXT: s_lshr_b32 s69, s93, 8
+; SI-NEXT: s_lshr_b32 s54, s95, 24
+; SI-NEXT: s_lshr_b32 s64, s95, 16
+; SI-NEXT: s_lshr_b32 s66, s95, 8
+; SI-NEXT: s_lshr_b32 s51, s31, 24
+; SI-NEXT: s_lshr_b32 s53, s31, 16
+; SI-NEXT: s_lshr_b32 s55, s31, 8
+; SI-NEXT: s_lshr_b32 s48, s35, 24
+; SI-NEXT: s_lshr_b32 s50, s35, 16
+; SI-NEXT: s_lshr_b32 s52, s35, 8
+; SI-NEXT: s_lshr_b32 s38, s37, 24
+; SI-NEXT: s_lshr_b32 s39, s37, 16
+; SI-NEXT: s_lshr_b32 s49, s37, 8
+; SI-NEXT: s_lshr_b64 s[4:5], s[78:79], 24
+; SI-NEXT: s_lshr_b64 s[6:7], s[78:79], 16
+; SI-NEXT: s_lshr_b64 s[8:9], s[78:79], 8
+; SI-NEXT: s_lshr_b64 s[10:11], s[88:89], 24
+; SI-NEXT: s_lshr_b64 s[12:13], s[88:89], 16
+; SI-NEXT: s_lshr_b64 s[14:15], s[88:89], 8
+; SI-NEXT: s_lshr_b64 s[16:17], s[90:91], 24
+; SI-NEXT: s_lshr_b64 s[18:19], s[90:91], 16
+; SI-NEXT: s_lshr_b64 s[20:21], s[90:91], 8
+; SI-NEXT: s_lshr_b64 s[22:23], s[92:93], 24
+; SI-NEXT: s_lshr_b64 s[24:25], s[92:93], 16
+; SI-NEXT: s_lshr_b64 s[26:27], s[92:93], 8
+; SI-NEXT: s_lshr_b64 s[42:43], s[94:95], 24
+; SI-NEXT: s_lshr_b64 s[46:47], s[94:95], 16
+; SI-NEXT: s_lshr_b64 s[56:57], s[94:95], 8
+; SI-NEXT: s_lshr_b64 s[28:29], s[30:31], 24
+; SI-NEXT: s_lshr_b64 s[40:41], s[30:31], 16
+; SI-NEXT: s_lshr_b64 s[44:45], s[30:31], 8
+; SI-NEXT: s_lshr_b64 s[58:59], s[34:35], 24
+; SI-NEXT: s_lshr_b64 s[60:61], s[34:35], 16
+; SI-NEXT: s_lshr_b64 s[62:63], s[34:35], 8
+; SI-NEXT: s_lshr_b64 s[72:73], s[36:37], 24
+; SI-NEXT: s_lshr_b64 s[74:75], s[36:37], 16
+; SI-NEXT: s_lshr_b64 s[76:77], s[36:37], 8
; SI-NEXT: s_cbranch_execnz .LBB49_4
; SI-NEXT: .LBB49_2: ; %cmp.true
-; SI-NEXT: v_add_f32_e64 v20, s17, 1.0
-; SI-NEXT: v_add_f32_e64 v22, s16, 1.0
-; SI-NEXT: v_add_f32_e64 v16, s19, 1.0
-; SI-NEXT: v_add_f32_e64 v18, s18, 1.0
-; SI-NEXT: v_add_f32_e64 v11, s21, 1.0
-; SI-NEXT: v_add_f32_e64 v15, s20, 1.0
-; SI-NEXT: v_add_f32_e64 v9, s23, 1.0
-; SI-NEXT: v_add_f32_e64 v10, s22, 1.0
-; SI-NEXT: v_add_f32_e64 v7, s25, 1.0
-; SI-NEXT: v_add_f32_e64 v8, s24, 1.0
-; SI-NEXT: v_add_f32_e64 v5, s27, 1.0
-; SI-NEXT: v_add_f32_e64 v6, s26, 1.0
-; SI-NEXT: v_add_f32_e64 v3, s29, 1.0
-; SI-NEXT: v_add_f32_e64 v4, s28, 1.0
-; SI-NEXT: v_add_f32_e64 v1, s37, 1.0
-; SI-NEXT: v_add_f32_e64 v2, s36, 1.0
-; SI-NEXT: v_readfirstlane_b32 s16, v22
-; SI-NEXT: v_readfirstlane_b32 s17, v20
-; SI-NEXT: v_readfirstlane_b32 s18, v18
-; SI-NEXT: v_readfirstlane_b32 s19, v16
-; SI-NEXT: v_readfirstlane_b32 s20, v15
-; SI-NEXT: v_readfirstlane_b32 s21, v11
-; SI-NEXT: v_readfirstlane_b32 s22, v10
-; SI-NEXT: v_readfirstlane_b32 s23, v9
-; SI-NEXT: v_readfirstlane_b32 s24, v8
-; SI-NEXT: v_readfirstlane_b32 s25, v7
-; SI-NEXT: v_readfirstlane_b32 s26, v6
-; SI-NEXT: v_readfirstlane_b32 s27, v5
+; SI-NEXT: v_add_f32_e64 v20, s37, 1.0
+; SI-NEXT: v_add_f32_e64 v22, s36, 1.0
+; SI-NEXT: v_add_f32_e64 v16, s35, 1.0
+; SI-NEXT: v_add_f32_e64 v18, s34, 1.0
+; SI-NEXT: v_add_f32_e64 v11, s31, 1.0
+; SI-NEXT: v_add_f32_e64 v15, s30, 1.0
+; SI-NEXT: v_add_f32_e64 v9, s95, 1.0
+; SI-NEXT: v_add_f32_e64 v10, s94, 1.0
+; SI-NEXT: v_add_f32_e64 v7, s93, 1.0
+; SI-NEXT: v_add_f32_e64 v8, s92, 1.0
+; SI-NEXT: v_add_f32_e64 v5, s91, 1.0
+; SI-NEXT: v_add_f32_e64 v6, s90, 1.0
+; SI-NEXT: v_add_f32_e64 v3, s89, 1.0
+; SI-NEXT: v_add_f32_e64 v4, s88, 1.0
+; SI-NEXT: v_add_f32_e64 v1, s79, 1.0
+; SI-NEXT: v_add_f32_e64 v2, s78, 1.0
+; SI-NEXT: v_readfirstlane_b32 s76, v22
+; SI-NEXT: v_readfirstlane_b32 s77, v20
+; SI-NEXT: v_readfirstlane_b32 s62, v18
+; SI-NEXT: v_readfirstlane_b32 s63, v16
+; SI-NEXT: v_readfirstlane_b32 s44, v15
+; SI-NEXT: v_readfirstlane_b32 s45, v11
+; SI-NEXT: v_readfirstlane_b32 s28, v10
+; SI-NEXT: v_readfirstlane_b32 s29, v9
+; SI-NEXT: v_readfirstlane_b32 s26, v8
+; SI-NEXT: v_readfirstlane_b32 s27, v7
+; SI-NEXT: v_readfirstlane_b32 s20, v6
+; SI-NEXT: v_readfirstlane_b32 s21, v5
; SI-NEXT: v_readfirstlane_b32 s14, v4
; SI-NEXT: v_readfirstlane_b32 s15, v3
; SI-NEXT: v_readfirstlane_b32 s8, v2
@@ -24700,24 +24966,24 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 24
; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16
; SI-NEXT: s_lshr_b64 s[14:15], s[14:15], 8
-; SI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24
-; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 16
-; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 8
-; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 24
-; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 16
-; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 8
-; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24
-; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16
-; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 8
-; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 24
-; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16
-; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 8
-; SI-NEXT: s_lshr_b64 s[88:89], s[18:19], 24
-; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 16
-; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 8
-; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 24
-; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 16
-; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 8
+; SI-NEXT: s_lshr_b64 s[16:17], s[20:21], 24
+; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16
+; SI-NEXT: s_lshr_b64 s[20:21], s[20:21], 8
+; SI-NEXT: s_lshr_b64 s[22:23], s[26:27], 24
+; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16
+; SI-NEXT: s_lshr_b64 s[26:27], s[26:27], 8
+; SI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24
+; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 16
+; SI-NEXT: s_lshr_b64 s[56:57], s[28:29], 8
+; SI-NEXT: s_lshr_b64 s[28:29], s[44:45], 24
+; SI-NEXT: s_lshr_b64 s[40:41], s[44:45], 16
+; SI-NEXT: s_lshr_b64 s[44:45], s[44:45], 8
+; SI-NEXT: s_lshr_b64 s[58:59], s[62:63], 24
+; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16
+; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 8
+; SI-NEXT: s_lshr_b64 s[72:73], s[76:77], 24
+; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16
+; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 8
; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v1
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v1
@@ -24744,21 +25010,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v20
; SI-NEXT: s_branch .LBB49_5
; SI-NEXT: .LBB49_3:
-; SI-NEXT: ; implicit-def: $sgpr34
-; SI-NEXT: ; implicit-def: $sgpr30
-; SI-NEXT: ; implicit-def: $sgpr94
+; SI-NEXT: ; implicit-def: $sgpr76
+; SI-NEXT: ; implicit-def: $sgpr74
+; SI-NEXT: ; implicit-def: $sgpr72
; SI-NEXT: ; implicit-def: $sgpr49
; SI-NEXT: ; implicit-def: $sgpr39
; SI-NEXT: ; implicit-def: $sgpr38
-; SI-NEXT: ; implicit-def: $sgpr92
-; SI-NEXT: ; implicit-def: $sgpr90
-; SI-NEXT: ; implicit-def: $sgpr88
+; SI-NEXT: ; implicit-def: $sgpr62
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: ; implicit-def: $sgpr58
; SI-NEXT: ; implicit-def: $sgpr52
; SI-NEXT: ; implicit-def: $sgpr50
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr74
-; SI-NEXT: ; implicit-def: $sgpr62
-; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: ; implicit-def: $sgpr44
+; SI-NEXT: ; implicit-def: $sgpr40
+; SI-NEXT: ; implicit-def: $sgpr28
; SI-NEXT: ; implicit-def: $sgpr55
; SI-NEXT: ; implicit-def: $sgpr53
; SI-NEXT: ; implicit-def: $sgpr51
@@ -24777,15 +25043,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; SI-NEXT: ; implicit-def: $sgpr85
; SI-NEXT: ; implicit-def: $sgpr84
; SI-NEXT: ; implicit-def: $sgpr82
-; SI-NEXT: ; implicit-def: $sgpr78
-; SI-NEXT: ; implicit-def: $sgpr76
-; SI-NEXT: ; implicit-def: $sgpr72
-; SI-NEXT: ; implicit-def: $sgpr58
; SI-NEXT: ; implicit-def: $sgpr56
; SI-NEXT: ; implicit-def: $sgpr46
-; SI-NEXT: ; implicit-def: $sgpr44
; SI-NEXT: ; implicit-def: $sgpr42
-; SI-NEXT: ; implicit-def: $sgpr40
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr24
+; SI-NEXT: ; implicit-def: $sgpr22
+; SI-NEXT: ; implicit-def: $sgpr20
+; SI-NEXT: ; implicit-def: $sgpr18
+; SI-NEXT: ; implicit-def: $sgpr16
; SI-NEXT: ; implicit-def: $sgpr14
; SI-NEXT: ; implicit-def: $sgpr12
; SI-NEXT: ; implicit-def: $sgpr10
@@ -24794,22 +25060,22 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: s_branch .LBB49_2
; SI-NEXT: .LBB49_4:
-; SI-NEXT: v_mov_b32_e32 v22, s16
-; SI-NEXT: v_mov_b32_e32 v20, s17
-; SI-NEXT: v_mov_b32_e32 v18, s18
-; SI-NEXT: v_mov_b32_e32 v16, s19
-; SI-NEXT: v_mov_b32_e32 v15, s20
-; SI-NEXT: v_mov_b32_e32 v11, s21
-; SI-NEXT: v_mov_b32_e32 v10, s22
-; SI-NEXT: v_mov_b32_e32 v9, s23
-; SI-NEXT: v_mov_b32_e32 v8, s24
-; SI-NEXT: v_mov_b32_e32 v7, s25
-; SI-NEXT: v_mov_b32_e32 v6, s26
-; SI-NEXT: v_mov_b32_e32 v5, s27
-; SI-NEXT: v_mov_b32_e32 v4, s28
-; SI-NEXT: v_mov_b32_e32 v3, s29
-; SI-NEXT: v_mov_b32_e32 v2, s36
-; SI-NEXT: v_mov_b32_e32 v1, s37
+; SI-NEXT: v_mov_b32_e32 v22, s36
+; SI-NEXT: v_mov_b32_e32 v20, s37
+; SI-NEXT: v_mov_b32_e32 v18, s34
+; SI-NEXT: v_mov_b32_e32 v16, s35
+; SI-NEXT: v_mov_b32_e32 v15, s30
+; SI-NEXT: v_mov_b32_e32 v11, s31
+; SI-NEXT: v_mov_b32_e32 v10, s94
+; SI-NEXT: v_mov_b32_e32 v9, s95
+; SI-NEXT: v_mov_b32_e32 v8, s92
+; SI-NEXT: v_mov_b32_e32 v7, s93
+; SI-NEXT: v_mov_b32_e32 v6, s90
+; SI-NEXT: v_mov_b32_e32 v5, s91
+; SI-NEXT: v_mov_b32_e32 v4, s88
+; SI-NEXT: v_mov_b32_e32 v3, s89
+; SI-NEXT: v_mov_b32_e32 v2, s78
+; SI-NEXT: v_mov_b32_e32 v1, s79
; SI-NEXT: v_mov_b32_e32 v48, s49
; SI-NEXT: v_mov_b32_e32 v39, s39
; SI-NEXT: v_mov_b32_e32 v38, s38
@@ -24836,11 +25102,11 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v12, s82
; SI-NEXT: .LBB49_5: ; %end
; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT: s_lshl_b32 s5, s34, 8
+; SI-NEXT: s_lshl_b32 s5, s76, 8
; SI-NEXT: v_or_b32_e32 v22, s5, v22
-; SI-NEXT: s_and_b32 s5, s30, 0xff
+; SI-NEXT: s_and_b32 s5, s74, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_lshl_b32 s7, s94, 24
+; SI-NEXT: s_lshl_b32 s7, s72, 24
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22
; SI-NEXT: s_or_b32 s5, s7, s5
; SI-NEXT: v_or_b32_e32 v22, s5, v22
@@ -24851,15 +25117,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; SI-NEXT: v_or_b32_e32 v20, v20, v22
; SI-NEXT: v_and_b32_e32 v22, 0xff, v39
; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
-; SI-NEXT: s_lshl_b32 s5, s92, 8
+; SI-NEXT: s_lshl_b32 s5, s62, 8
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v38
; SI-NEXT: v_or_b32_e32 v18, s5, v18
-; SI-NEXT: s_and_b32 s5, s90, 0xff
+; SI-NEXT: s_and_b32 s5, s60, 0xff
; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20
; SI-NEXT: v_or_b32_e32 v22, v38, v22
; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_lshl_b32 s7, s88, 24
+; SI-NEXT: s_lshl_b32 s7, s58, 24
; SI-NEXT: v_or_b32_e32 v20, v20, v22
; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0
; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18
@@ -24875,15 +25141,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; SI-NEXT: v_or_b32_e32 v16, v16, v18
; SI-NEXT: v_and_b32_e32 v18, 0xff, v36
; SI-NEXT: v_and_b32_e32 v15, 0xff, v15
-; SI-NEXT: s_lshl_b32 s5, s74, 8
+; SI-NEXT: s_lshl_b32 s5, s44, 8
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v35
; SI-NEXT: v_or_b32_e32 v15, s5, v15
-; SI-NEXT: s_and_b32 s5, s62, 0xff
+; SI-NEXT: s_and_b32 s5, s40, 0xff
; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16
; SI-NEXT: v_or_b32_e32 v18, v20, v18
; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_lshl_b32 s7, s60, 24
+; SI-NEXT: s_lshl_b32 s7, s28, 24
; SI-NEXT: v_or_b32_e32 v16, v16, v18
; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0
; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15
@@ -24899,15 +25165,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; SI-NEXT: v_or_b32_e32 v11, v11, v15
; SI-NEXT: v_and_b32_e32 v15, 0xff, v33
; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT: s_lshl_b32 s5, s78, 8
+; SI-NEXT: s_lshl_b32 s5, s56, 8
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v32
; SI-NEXT: v_or_b32_e32 v10, s5, v10
-; SI-NEXT: s_and_b32 s5, s76, 0xff
+; SI-NEXT: s_and_b32 s5, s46, 0xff
; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
; SI-NEXT: v_or_b32_e32 v15, v16, v15
; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_lshl_b32 s7, s72, 24
+; SI-NEXT: s_lshl_b32 s7, s42, 24
; SI-NEXT: v_or_b32_e32 v11, v11, v15
; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0
; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
@@ -24923,15 +25189,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_and_b32_e32 v10, 0xff, v30
; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT: s_lshl_b32 s5, s58, 8
+; SI-NEXT: s_lshl_b32 s5, s26, 8
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v29
; SI-NEXT: v_or_b32_e32 v8, s5, v8
-; SI-NEXT: s_and_b32 s5, s56, 0xff
+; SI-NEXT: s_and_b32 s5, s24, 0xff
; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
; SI-NEXT: v_or_b32_e32 v10, v11, v10
; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_lshl_b32 s7, s46, 24
+; SI-NEXT: s_lshl_b32 s7, s22, 24
; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0
; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
@@ -24947,15 +25213,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_and_b32_e32 v8, 0xff, v27
; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT: s_lshl_b32 s5, s44, 8
+; SI-NEXT: s_lshl_b32 s5, s20, 8
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v26
; SI-NEXT: v_or_b32_e32 v6, s5, v6
-; SI-NEXT: s_and_b32 s5, s42, 0xff
+; SI-NEXT: s_and_b32 s5, s18, 0xff
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v9, v8
; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_lshl_b32 s7, s40, 24
+; SI-NEXT: s_lshl_b32 s7, s16, 24
; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_add_i32_e32 v8, vcc, 36, v0
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
@@ -25086,10 +25352,38 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; VI-NEXT: v_writelane_b32 v63, s64, 16
; VI-NEXT: v_writelane_b32 v63, s65, 17
; VI-NEXT: v_writelane_b32 v63, s66, 18
+; VI-NEXT: v_mov_b32_e32 v4, s16
+; VI-NEXT: v_mov_b32_e32 v5, s17
+; VI-NEXT: v_mov_b32_e32 v6, s18
+; VI-NEXT: v_mov_b32_e32 v7, s19
+; VI-NEXT: v_mov_b32_e32 v8, s20
+; VI-NEXT: v_mov_b32_e32 v9, s21
+; VI-NEXT: v_mov_b32_e32 v10, s22
+; VI-NEXT: v_mov_b32_e32 v11, s23
+; VI-NEXT: v_mov_b32_e32 v12, s24
+; VI-NEXT: v_mov_b32_e32 v13, s25
+; VI-NEXT: v_mov_b32_e32 v14, s26
+; VI-NEXT: v_mov_b32_e32 v15, s27
+; VI-NEXT: v_mov_b32_e32 v16, s28
+; VI-NEXT: v_mov_b32_e32 v17, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; VI-NEXT: v_writelane_b32 v63, s67, 19
+; VI-NEXT: v_readfirstlane_b32 s18, v4
+; VI-NEXT: v_readfirstlane_b32 s19, v5
+; VI-NEXT: v_readfirstlane_b32 s16, v6
+; VI-NEXT: v_readfirstlane_b32 s17, v7
+; VI-NEXT: v_readfirstlane_b32 s14, v8
+; VI-NEXT: v_readfirstlane_b32 s15, v9
+; VI-NEXT: v_readfirstlane_b32 s12, v10
+; VI-NEXT: v_readfirstlane_b32 s13, v11
+; VI-NEXT: v_readfirstlane_b32 s10, v12
+; VI-NEXT: v_readfirstlane_b32 s11, v13
+; VI-NEXT: v_readfirstlane_b32 s8, v14
+; VI-NEXT: v_readfirstlane_b32 s9, v15
+; VI-NEXT: v_readfirstlane_b32 s6, v16
+; VI-NEXT: v_readfirstlane_b32 s7, v17
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_and_b64 s[20:21], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s5, v2
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -25113,75 +25407,75 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; VI-NEXT: s_lshr_b32 s59, s5, 8
; VI-NEXT: s_lshr_b32 s58, s4, 16
; VI-NEXT: s_lshr_b32 s60, s4, 8
-; VI-NEXT: s_lshr_b32 s61, s29, 24
-; VI-NEXT: s_lshr_b32 s62, s29, 16
-; VI-NEXT: s_lshr_b32 s72, s29, 8
-; VI-NEXT: s_lshr_b32 s63, s28, 16
-; VI-NEXT: s_lshr_b32 s73, s28, 8
-; VI-NEXT: s_lshr_b32 s74, s27, 24
-; VI-NEXT: s_lshr_b32 s75, s27, 16
-; VI-NEXT: s_lshr_b32 s77, s27, 8
-; VI-NEXT: s_lshr_b32 s76, s26, 16
-; VI-NEXT: s_lshr_b32 s78, s26, 8
-; VI-NEXT: s_lshr_b32 s79, s25, 24
-; VI-NEXT: s_lshr_b32 s88, s25, 16
-; VI-NEXT: s_lshr_b32 s90, s25, 8
-; VI-NEXT: s_lshr_b32 s89, s24, 16
-; VI-NEXT: s_lshr_b32 s91, s24, 8
-; VI-NEXT: s_lshr_b32 s30, s23, 24
-; VI-NEXT: s_lshr_b32 s31, s23, 16
-; VI-NEXT: s_lshr_b32 s35, s23, 8
-; VI-NEXT: s_lshr_b32 s34, s22, 16
-; VI-NEXT: s_lshr_b32 s36, s22, 8
-; VI-NEXT: s_lshr_b32 s37, s21, 24
-; VI-NEXT: s_lshr_b32 s38, s21, 16
-; VI-NEXT: s_lshr_b32 s48, s21, 8
-; VI-NEXT: s_lshr_b32 s39, s20, 16
-; VI-NEXT: s_lshr_b32 s49, s20, 8
-; VI-NEXT: s_lshr_b32 s50, s19, 24
-; VI-NEXT: s_lshr_b32 s51, s19, 16
-; VI-NEXT: s_lshr_b32 s53, s19, 8
-; VI-NEXT: s_lshr_b32 s52, s18, 16
-; VI-NEXT: s_lshr_b32 s54, s18, 8
-; VI-NEXT: s_lshr_b32 s55, s17, 24
-; VI-NEXT: s_lshr_b32 s64, s17, 16
-; VI-NEXT: s_lshr_b32 s66, s17, 8
-; VI-NEXT: s_lshr_b32 s65, s16, 16
-; VI-NEXT: s_lshr_b32 s67, s16, 8
+; VI-NEXT: s_lshr_b32 s61, s7, 24
+; VI-NEXT: s_lshr_b32 s62, s7, 16
+; VI-NEXT: s_lshr_b32 s72, s7, 8
+; VI-NEXT: s_lshr_b32 s63, s6, 16
+; VI-NEXT: s_lshr_b32 s73, s6, 8
+; VI-NEXT: s_lshr_b32 s74, s9, 24
+; VI-NEXT: s_lshr_b32 s75, s9, 16
+; VI-NEXT: s_lshr_b32 s77, s9, 8
+; VI-NEXT: s_lshr_b32 s76, s8, 16
+; VI-NEXT: s_lshr_b32 s78, s8, 8
+; VI-NEXT: s_lshr_b32 s79, s11, 24
+; VI-NEXT: s_lshr_b32 s88, s11, 16
+; VI-NEXT: s_lshr_b32 s90, s11, 8
+; VI-NEXT: s_lshr_b32 s89, s10, 16
+; VI-NEXT: s_lshr_b32 s91, s10, 8
+; VI-NEXT: s_lshr_b32 s30, s13, 24
+; VI-NEXT: s_lshr_b32 s31, s13, 16
+; VI-NEXT: s_lshr_b32 s35, s13, 8
+; VI-NEXT: s_lshr_b32 s34, s12, 16
+; VI-NEXT: s_lshr_b32 s36, s12, 8
+; VI-NEXT: s_lshr_b32 s37, s15, 24
+; VI-NEXT: s_lshr_b32 s38, s15, 16
+; VI-NEXT: s_lshr_b32 s48, s15, 8
+; VI-NEXT: s_lshr_b32 s39, s14, 16
+; VI-NEXT: s_lshr_b32 s49, s14, 8
+; VI-NEXT: s_lshr_b32 s50, s17, 24
+; VI-NEXT: s_lshr_b32 s51, s17, 16
+; VI-NEXT: s_lshr_b32 s53, s17, 8
+; VI-NEXT: s_lshr_b32 s52, s16, 16
+; VI-NEXT: s_lshr_b32 s54, s16, 8
+; VI-NEXT: s_lshr_b32 s55, s19, 24
+; VI-NEXT: s_lshr_b32 s64, s19, 16
+; VI-NEXT: s_lshr_b32 s66, s19, 8
+; VI-NEXT: s_lshr_b32 s65, s18, 16
+; VI-NEXT: s_lshr_b32 s67, s18, 8
; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24
-; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24
-; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24
-; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24
-; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24
-; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24
-; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24
-; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[42:43], s[6:7], 24
+; VI-NEXT: s_lshr_b64 s[40:41], s[8:9], 24
+; VI-NEXT: s_lshr_b64 s[28:29], s[10:11], 24
+; VI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24
+; VI-NEXT: s_lshr_b64 s[24:25], s[14:15], 24
+; VI-NEXT: s_lshr_b64 s[22:23], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24
; VI-NEXT: s_cbranch_execnz .LBB49_4
; VI-NEXT: .LBB49_2: ; %cmp.true
-; VI-NEXT: v_add_f32_e64 v6, s27, 1.0
-; VI-NEXT: v_add_f32_e64 v5, s26, 1.0
+; VI-NEXT: v_add_f32_e64 v6, s9, 1.0
+; VI-NEXT: v_add_f32_e64 v5, s8, 1.0
; VI-NEXT: v_add_f32_e64 v2, s5, 1.0
; VI-NEXT: v_add_f32_e64 v1, s4, 1.0
-; VI-NEXT: v_add_f32_e64 v8, s25, 1.0
-; VI-NEXT: v_add_f32_e64 v7, s24, 1.0
+; VI-NEXT: v_add_f32_e64 v8, s11, 1.0
+; VI-NEXT: v_add_f32_e64 v7, s10, 1.0
; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2]
; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[5:6]
-; VI-NEXT: v_add_f32_e64 v10, s23, 1.0
-; VI-NEXT: v_add_f32_e64 v9, s22, 1.0
+; VI-NEXT: v_add_f32_e64 v10, s13, 1.0
+; VI-NEXT: v_add_f32_e64 v9, s12, 1.0
; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
-; VI-NEXT: v_add_f32_e64 v12, s21, 1.0
-; VI-NEXT: v_add_f32_e64 v11, s20, 1.0
-; VI-NEXT: v_add_f32_e64 v4, s29, 1.0
-; VI-NEXT: v_add_f32_e64 v3, s28, 1.0
+; VI-NEXT: v_add_f32_e64 v12, s15, 1.0
+; VI-NEXT: v_add_f32_e64 v11, s14, 1.0
+; VI-NEXT: v_add_f32_e64 v4, s7, 1.0
+; VI-NEXT: v_add_f32_e64 v3, s6, 1.0
; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[9:10]
-; VI-NEXT: v_add_f32_e64 v16, s19, 1.0
-; VI-NEXT: v_add_f32_e64 v15, s18, 1.0
+; VI-NEXT: v_add_f32_e64 v16, s17, 1.0
+; VI-NEXT: v_add_f32_e64 v15, s16, 1.0
; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4]
; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12]
-; VI-NEXT: v_add_f32_e64 v18, s17, 1.0
-; VI-NEXT: v_add_f32_e64 v17, s16, 1.0
+; VI-NEXT: v_add_f32_e64 v18, s19, 1.0
+; VI-NEXT: v_add_f32_e64 v17, s18, 1.0
; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[15:16]
; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
@@ -25230,31 +25524,31 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; VI-NEXT: .LBB49_3:
; VI-NEXT: ; implicit-def: $sgpr67
; VI-NEXT: ; implicit-def: $sgpr65
-; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr20
; VI-NEXT: ; implicit-def: $sgpr66
; VI-NEXT: ; implicit-def: $sgpr64
; VI-NEXT: ; implicit-def: $sgpr55
; VI-NEXT: ; implicit-def: $sgpr54
; VI-NEXT: ; implicit-def: $sgpr52
-; VI-NEXT: ; implicit-def: $sgpr8
+; VI-NEXT: ; implicit-def: $sgpr22
; VI-NEXT: ; implicit-def: $sgpr53
; VI-NEXT: ; implicit-def: $sgpr51
; VI-NEXT: ; implicit-def: $sgpr50
; VI-NEXT: ; implicit-def: $sgpr49
; VI-NEXT: ; implicit-def: $sgpr39
-; VI-NEXT: ; implicit-def: $sgpr10
+; VI-NEXT: ; implicit-def: $sgpr24
; VI-NEXT: ; implicit-def: $sgpr48
; VI-NEXT: ; implicit-def: $sgpr38
; VI-NEXT: ; implicit-def: $sgpr37
; VI-NEXT: ; implicit-def: $sgpr36
; VI-NEXT: ; implicit-def: $sgpr34
-; VI-NEXT: ; implicit-def: $sgpr12
+; VI-NEXT: ; implicit-def: $sgpr26
; VI-NEXT: ; implicit-def: $sgpr35
; VI-NEXT: ; implicit-def: $sgpr31
; VI-NEXT: ; implicit-def: $sgpr30
; VI-NEXT: ; implicit-def: $sgpr91
; VI-NEXT: ; implicit-def: $sgpr89
-; VI-NEXT: ; implicit-def: $sgpr14
+; VI-NEXT: ; implicit-def: $sgpr28
; VI-NEXT: ; implicit-def: $sgpr90
; VI-NEXT: ; implicit-def: $sgpr88
; VI-NEXT: ; implicit-def: $sgpr79
@@ -25282,20 +25576,20 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v20, s42
-; VI-NEXT: v_mov_b32_e32 v17, s16
-; VI-NEXT: v_mov_b32_e32 v18, s17
-; VI-NEXT: v_mov_b32_e32 v15, s18
-; VI-NEXT: v_mov_b32_e32 v16, s19
-; VI-NEXT: v_mov_b32_e32 v11, s20
-; VI-NEXT: v_mov_b32_e32 v12, s21
-; VI-NEXT: v_mov_b32_e32 v9, s22
-; VI-NEXT: v_mov_b32_e32 v10, s23
-; VI-NEXT: v_mov_b32_e32 v7, s24
-; VI-NEXT: v_mov_b32_e32 v8, s25
-; VI-NEXT: v_mov_b32_e32 v5, s26
-; VI-NEXT: v_mov_b32_e32 v6, s27
-; VI-NEXT: v_mov_b32_e32 v3, s28
-; VI-NEXT: v_mov_b32_e32 v4, s29
+; VI-NEXT: v_mov_b32_e32 v17, s18
+; VI-NEXT: v_mov_b32_e32 v18, s19
+; VI-NEXT: v_mov_b32_e32 v15, s16
+; VI-NEXT: v_mov_b32_e32 v16, s17
+; VI-NEXT: v_mov_b32_e32 v11, s14
+; VI-NEXT: v_mov_b32_e32 v12, s15
+; VI-NEXT: v_mov_b32_e32 v9, s12
+; VI-NEXT: v_mov_b32_e32 v10, s13
+; VI-NEXT: v_mov_b32_e32 v7, s10
+; VI-NEXT: v_mov_b32_e32 v8, s11
+; VI-NEXT: v_mov_b32_e32 v5, s8
+; VI-NEXT: v_mov_b32_e32 v6, s9
+; VI-NEXT: v_mov_b32_e32 v3, s6
+; VI-NEXT: v_mov_b32_e32 v4, s7
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_mov_b32_e32 v19, s67
@@ -25338,14 +25632,14 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v27, s59
; VI-NEXT: v_mov_b32_e32 v14, s57
; VI-NEXT: v_mov_b32_e32 v26, s56
-; VI-NEXT: v_mov_b32_e32 v22, s12
-; VI-NEXT: v_mov_b32_e32 v23, s10
-; VI-NEXT: v_mov_b32_e32 v24, s8
-; VI-NEXT: v_mov_b32_e32 v25, s6
+; VI-NEXT: v_mov_b32_e32 v22, s26
+; VI-NEXT: v_mov_b32_e32 v23, s24
+; VI-NEXT: v_mov_b32_e32 v24, s22
+; VI-NEXT: v_mov_b32_e32 v25, s20
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v20, s40
-; VI-NEXT: v_mov_b32_e32 v21, s14
+; VI-NEXT: v_mov_b32_e32 v21, s28
; VI-NEXT: .LBB49_5: ; %end
; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13
; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19
@@ -25434,21 +25728,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: v_readlane_b32 s67, v63, 19
; VI-NEXT: v_readlane_b32 s66, v63, 18
; VI-NEXT: v_readlane_b32 s65, v63, 17
@@ -25469,7 +25748,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; VI-NEXT: v_readlane_b32 s34, v63, 2
; VI-NEXT: v_readlane_b32 s31, v63, 1
; VI-NEXT: v_readlane_b32 s30, v63, 0
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; VI-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -25499,6 +25778,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
@@ -25526,10 +25820,38 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v63, s52, 12
; GFX9-NEXT: v_writelane_b32 v63, s53, 13
; GFX9-NEXT: v_writelane_b32 v63, s54, 14
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: v_mov_b32_e32 v5, s17
+; GFX9-NEXT: v_mov_b32_e32 v6, s18
+; GFX9-NEXT: v_mov_b32_e32 v7, s19
+; GFX9-NEXT: v_mov_b32_e32 v8, s20
+; GFX9-NEXT: v_mov_b32_e32 v9, s21
+; GFX9-NEXT: v_mov_b32_e32 v10, s22
+; GFX9-NEXT: v_mov_b32_e32 v11, s23
+; GFX9-NEXT: v_mov_b32_e32 v12, s24
+; GFX9-NEXT: v_mov_b32_e32 v13, s25
+; GFX9-NEXT: v_mov_b32_e32 v14, s26
+; GFX9-NEXT: v_mov_b32_e32 v15, s27
+; GFX9-NEXT: v_mov_b32_e32 v16, s28
+; GFX9-NEXT: v_mov_b32_e32 v17, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; GFX9-NEXT: v_writelane_b32 v63, s55, 15
+; GFX9-NEXT: v_readfirstlane_b32 s18, v4
+; GFX9-NEXT: v_readfirstlane_b32 s19, v5
+; GFX9-NEXT: v_readfirstlane_b32 s16, v6
+; GFX9-NEXT: v_readfirstlane_b32 s17, v7
+; GFX9-NEXT: v_readfirstlane_b32 s14, v8
+; GFX9-NEXT: v_readfirstlane_b32 s15, v9
+; GFX9-NEXT: v_readfirstlane_b32 s12, v10
+; GFX9-NEXT: v_readfirstlane_b32 s13, v11
+; GFX9-NEXT: v_readfirstlane_b32 s10, v12
+; GFX9-NEXT: v_readfirstlane_b32 s11, v13
+; GFX9-NEXT: v_readfirstlane_b32 s8, v14
+; GFX9-NEXT: v_readfirstlane_b32 s9, v15
+; GFX9-NEXT: v_readfirstlane_b32 s6, v16
+; GFX9-NEXT: v_readfirstlane_b32 s7, v17
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -25553,76 +25875,76 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX9-NEXT: s_lshr_b32 s59, s5, 8
; GFX9-NEXT: s_lshr_b32 s58, s4, 16
; GFX9-NEXT: s_lshr_b32 s60, s4, 8
-; GFX9-NEXT: s_lshr_b32 s61, s29, 24
-; GFX9-NEXT: s_lshr_b32 s62, s29, 16
-; GFX9-NEXT: s_lshr_b32 s72, s29, 8
-; GFX9-NEXT: s_lshr_b32 s63, s28, 16
-; GFX9-NEXT: s_lshr_b32 s73, s28, 8
-; GFX9-NEXT: s_lshr_b32 s74, s27, 24
-; GFX9-NEXT: s_lshr_b32 s75, s27, 16
-; GFX9-NEXT: s_lshr_b32 s77, s27, 8
-; GFX9-NEXT: s_lshr_b32 s76, s26, 16
-; GFX9-NEXT: s_lshr_b32 s78, s26, 8
-; GFX9-NEXT: s_lshr_b32 s79, s25, 24
-; GFX9-NEXT: s_lshr_b32 s88, s25, 16
-; GFX9-NEXT: s_lshr_b32 s90, s25, 8
-; GFX9-NEXT: s_lshr_b32 s89, s24, 16
-; GFX9-NEXT: s_lshr_b32 s91, s24, 8
-; GFX9-NEXT: s_lshr_b32 s92, s23, 24
-; GFX9-NEXT: s_lshr_b32 s93, s23, 16
-; GFX9-NEXT: s_lshr_b32 s95, s23, 8
-; GFX9-NEXT: s_lshr_b32 s94, s22, 16
-; GFX9-NEXT: s_lshr_b32 s30, s22, 8
-; GFX9-NEXT: s_lshr_b32 s31, s21, 24
-; GFX9-NEXT: s_lshr_b32 s34, s21, 16
-; GFX9-NEXT: s_lshr_b32 s36, s21, 8
-; GFX9-NEXT: s_lshr_b32 s35, s20, 16
-; GFX9-NEXT: s_lshr_b32 s37, s20, 8
-; GFX9-NEXT: s_lshr_b32 s38, s19, 24
-; GFX9-NEXT: s_lshr_b32 s39, s19, 16
-; GFX9-NEXT: s_lshr_b32 s49, s19, 8
-; GFX9-NEXT: s_lshr_b32 s48, s18, 16
-; GFX9-NEXT: s_lshr_b32 s50, s18, 8
-; GFX9-NEXT: s_lshr_b32 s51, s17, 24
-; GFX9-NEXT: s_lshr_b32 s52, s17, 16
-; GFX9-NEXT: s_lshr_b32 s54, s17, 8
-; GFX9-NEXT: s_lshr_b32 s53, s16, 16
-; GFX9-NEXT: s_lshr_b32 s55, s16, 8
+; GFX9-NEXT: s_lshr_b32 s61, s7, 24
+; GFX9-NEXT: s_lshr_b32 s62, s7, 16
+; GFX9-NEXT: s_lshr_b32 s72, s7, 8
+; GFX9-NEXT: s_lshr_b32 s63, s6, 16
+; GFX9-NEXT: s_lshr_b32 s73, s6, 8
+; GFX9-NEXT: s_lshr_b32 s74, s9, 24
+; GFX9-NEXT: s_lshr_b32 s75, s9, 16
+; GFX9-NEXT: s_lshr_b32 s77, s9, 8
+; GFX9-NEXT: s_lshr_b32 s76, s8, 16
+; GFX9-NEXT: s_lshr_b32 s78, s8, 8
+; GFX9-NEXT: s_lshr_b32 s79, s11, 24
+; GFX9-NEXT: s_lshr_b32 s88, s11, 16
+; GFX9-NEXT: s_lshr_b32 s90, s11, 8
+; GFX9-NEXT: s_lshr_b32 s89, s10, 16
+; GFX9-NEXT: s_lshr_b32 s91, s10, 8
+; GFX9-NEXT: s_lshr_b32 s92, s13, 24
+; GFX9-NEXT: s_lshr_b32 s93, s13, 16
+; GFX9-NEXT: s_lshr_b32 s95, s13, 8
+; GFX9-NEXT: s_lshr_b32 s94, s12, 16
+; GFX9-NEXT: s_lshr_b32 s30, s12, 8
+; GFX9-NEXT: s_lshr_b32 s31, s15, 24
+; GFX9-NEXT: s_lshr_b32 s34, s15, 16
+; GFX9-NEXT: s_lshr_b32 s36, s15, 8
+; GFX9-NEXT: s_lshr_b32 s35, s14, 16
+; GFX9-NEXT: s_lshr_b32 s37, s14, 8
+; GFX9-NEXT: s_lshr_b32 s38, s17, 24
+; GFX9-NEXT: s_lshr_b32 s39, s17, 16
+; GFX9-NEXT: s_lshr_b32 s49, s17, 8
+; GFX9-NEXT: s_lshr_b32 s48, s16, 16
+; GFX9-NEXT: s_lshr_b32 s50, s16, 8
+; GFX9-NEXT: s_lshr_b32 s51, s19, 24
+; GFX9-NEXT: s_lshr_b32 s52, s19, 16
+; GFX9-NEXT: s_lshr_b32 s54, s19, 8
+; GFX9-NEXT: s_lshr_b32 s53, s18, 16
+; GFX9-NEXT: s_lshr_b32 s55, s18, 8
; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24
-; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24
-; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24
-; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24
-; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[42:43], s[6:7], 24
+; GFX9-NEXT: s_lshr_b64 s[40:41], s[8:9], 24
+; GFX9-NEXT: s_lshr_b64 s[28:29], s[10:11], 24
+; GFX9-NEXT: s_lshr_b64 s[26:27], s[12:13], 24
+; GFX9-NEXT: s_lshr_b64 s[24:25], s[14:15], 24
+; GFX9-NEXT: s_lshr_b64 s[22:23], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[20:21], s[18:19], 24
; GFX9-NEXT: s_cbranch_execnz .LBB49_4
; GFX9-NEXT: .LBB49_2: ; %cmp.true
-; GFX9-NEXT: v_add_f32_e64 v6, s27, 1.0
-; GFX9-NEXT: v_add_f32_e64 v5, s26, 1.0
+; GFX9-NEXT: v_add_f32_e64 v6, s9, 1.0
+; GFX9-NEXT: v_add_f32_e64 v5, s8, 1.0
; GFX9-NEXT: v_add_f32_e64 v2, s5, 1.0
; GFX9-NEXT: v_add_f32_e64 v1, s4, 1.0
-; GFX9-NEXT: v_add_f32_e64 v8, s25, 1.0
-; GFX9-NEXT: v_add_f32_e64 v7, s24, 1.0
+; GFX9-NEXT: v_add_f32_e64 v8, s11, 1.0
+; GFX9-NEXT: v_add_f32_e64 v7, s10, 1.0
; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2]
; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6]
-; GFX9-NEXT: v_add_f32_e64 v10, s23, 1.0
-; GFX9-NEXT: v_add_f32_e64 v9, s22, 1.0
+; GFX9-NEXT: v_add_f32_e64 v10, s13, 1.0
+; GFX9-NEXT: v_add_f32_e64 v9, s12, 1.0
; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8]
-; GFX9-NEXT: v_add_f32_e64 v12, s21, 1.0
-; GFX9-NEXT: v_add_f32_e64 v11, s20, 1.0
-; GFX9-NEXT: v_add_f32_e64 v4, s29, 1.0
-; GFX9-NEXT: v_add_f32_e64 v3, s28, 1.0
+; GFX9-NEXT: v_add_f32_e64 v12, s15, 1.0
+; GFX9-NEXT: v_add_f32_e64 v11, s14, 1.0
+; GFX9-NEXT: v_add_f32_e64 v4, s7, 1.0
+; GFX9-NEXT: v_add_f32_e64 v3, s6, 1.0
; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10]
-; GFX9-NEXT: v_add_f32_e64 v16, s19, 1.0
-; GFX9-NEXT: v_add_f32_e64 v15, s18, 1.0
+; GFX9-NEXT: v_add_f32_e64 v16, s17, 1.0
+; GFX9-NEXT: v_add_f32_e64 v15, s16, 1.0
; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4]
; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12]
-; GFX9-NEXT: v_add_f32_e64 v20, s17, 1.0
-; GFX9-NEXT: v_add_f32_e64 v19, s16, 1.0
+; GFX9-NEXT: v_add_f32_e64 v20, s19, 1.0
+; GFX9-NEXT: v_add_f32_e64 v19, s18, 1.0
; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16]
; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
@@ -25672,31 +25994,31 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX9-NEXT: .LBB49_3:
; GFX9-NEXT: ; implicit-def: $sgpr55
; GFX9-NEXT: ; implicit-def: $sgpr53
-; GFX9-NEXT: ; implicit-def: $sgpr6
+; GFX9-NEXT: ; implicit-def: $sgpr20
; GFX9-NEXT: ; implicit-def: $sgpr54
; GFX9-NEXT: ; implicit-def: $sgpr52
; GFX9-NEXT: ; implicit-def: $sgpr51
; GFX9-NEXT: ; implicit-def: $sgpr50
; GFX9-NEXT: ; implicit-def: $sgpr48
-; GFX9-NEXT: ; implicit-def: $sgpr8
+; GFX9-NEXT: ; implicit-def: $sgpr22
; GFX9-NEXT: ; implicit-def: $sgpr49
; GFX9-NEXT: ; implicit-def: $sgpr39
; GFX9-NEXT: ; implicit-def: $sgpr38
; GFX9-NEXT: ; implicit-def: $sgpr37
; GFX9-NEXT: ; implicit-def: $sgpr35
-; GFX9-NEXT: ; implicit-def: $sgpr10
+; GFX9-NEXT: ; implicit-def: $sgpr24
; GFX9-NEXT: ; implicit-def: $sgpr36
; GFX9-NEXT: ; implicit-def: $sgpr34
; GFX9-NEXT: ; implicit-def: $sgpr31
; GFX9-NEXT: ; implicit-def: $sgpr30
; GFX9-NEXT: ; implicit-def: $sgpr94
-; GFX9-NEXT: ; implicit-def: $sgpr12
+; GFX9-NEXT: ; implicit-def: $sgpr26
; GFX9-NEXT: ; implicit-def: $sgpr95
; GFX9-NEXT: ; implicit-def: $sgpr93
; GFX9-NEXT: ; implicit-def: $sgpr92
; GFX9-NEXT: ; implicit-def: $sgpr91
; GFX9-NEXT: ; implicit-def: $sgpr89
-; GFX9-NEXT: ; implicit-def: $sgpr14
+; GFX9-NEXT: ; implicit-def: $sgpr28
; GFX9-NEXT: ; implicit-def: $sgpr90
; GFX9-NEXT: ; implicit-def: $sgpr88
; GFX9-NEXT: ; implicit-def: $sgpr79
@@ -25725,20 +26047,20 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v21, s42
-; GFX9-NEXT: v_mov_b32_e32 v19, s16
-; GFX9-NEXT: v_mov_b32_e32 v20, s17
-; GFX9-NEXT: v_mov_b32_e32 v15, s18
-; GFX9-NEXT: v_mov_b32_e32 v16, s19
-; GFX9-NEXT: v_mov_b32_e32 v11, s20
-; GFX9-NEXT: v_mov_b32_e32 v12, s21
-; GFX9-NEXT: v_mov_b32_e32 v9, s22
-; GFX9-NEXT: v_mov_b32_e32 v10, s23
-; GFX9-NEXT: v_mov_b32_e32 v7, s24
-; GFX9-NEXT: v_mov_b32_e32 v8, s25
-; GFX9-NEXT: v_mov_b32_e32 v5, s26
-; GFX9-NEXT: v_mov_b32_e32 v6, s27
-; GFX9-NEXT: v_mov_b32_e32 v3, s28
-; GFX9-NEXT: v_mov_b32_e32 v4, s29
+; GFX9-NEXT: v_mov_b32_e32 v19, s18
+; GFX9-NEXT: v_mov_b32_e32 v20, s19
+; GFX9-NEXT: v_mov_b32_e32 v15, s16
+; GFX9-NEXT: v_mov_b32_e32 v16, s17
+; GFX9-NEXT: v_mov_b32_e32 v11, s14
+; GFX9-NEXT: v_mov_b32_e32 v12, s15
+; GFX9-NEXT: v_mov_b32_e32 v9, s12
+; GFX9-NEXT: v_mov_b32_e32 v10, s13
+; GFX9-NEXT: v_mov_b32_e32 v7, s10
+; GFX9-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NEXT: v_mov_b32_e32 v5, s8
+; GFX9-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_mov_b32_e32 v17, s55
@@ -25781,15 +26103,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v27, s59
; GFX9-NEXT: v_mov_b32_e32 v14, s57
; GFX9-NEXT: v_mov_b32_e32 v18, s56
-; GFX9-NEXT: v_mov_b32_e32 v23, s12
-; GFX9-NEXT: v_mov_b32_e32 v24, s10
-; GFX9-NEXT: v_mov_b32_e32 v25, s8
-; GFX9-NEXT: v_mov_b32_e32 v26, s6
+; GFX9-NEXT: v_mov_b32_e32 v23, s26
+; GFX9-NEXT: v_mov_b32_e32 v24, s24
+; GFX9-NEXT: v_mov_b32_e32 v25, s22
+; GFX9-NEXT: v_mov_b32_e32 v26, s20
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v21, s40
-; GFX9-NEXT: v_mov_b32_e32 v22, s14
+; GFX9-NEXT: v_mov_b32_e32 v22, s28
; GFX9-NEXT: .LBB49_5: ; %end
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -25867,21 +26189,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s55, v63, 15
; GFX9-NEXT: v_readlane_b32 s54, v63, 14
; GFX9-NEXT: v_readlane_b32 s53, v63, 13
@@ -25898,7 +26205,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX9-NEXT: v_readlane_b32 s34, v63, 2
; GFX9-NEXT: v_readlane_b32 s31, v63, 1
; GFX9-NEXT: v_readlane_b32 s30, v63, 0
-; GFX9-NEXT: s_waitcnt vmcnt(16)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -25924,6 +26231,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -31414,112 +31736,110 @@ define inreg <32 x i16> @bitcast_v8i64_to_v32i16_scalar(<8 x i64> inreg %a, i32
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: s_and_b64 s[6:7], vcc, exec
-; SI-NEXT: v_readfirstlane_b32 s5, v1
+; SI-NEXT: v_mov_b32_e32 v33, v1
+; SI-NEXT: v_mov_b32_e32 v32, v0
+; SI-NEXT: v_mov_b32_e32 v34, s16
+; SI-NEXT: v_mov_b32_e32 v35, s17
+; SI-NEXT: v_mov_b32_e32 v36, s18
+; SI-NEXT: v_mov_b32_e32 v37, s19
+; SI-NEXT: v_mov_b32_e32 v38, s20
+; SI-NEXT: v_mov_b32_e32 v39, s21
+; SI-NEXT: v_mov_b32_e32 v48, s22
+; SI-NEXT: v_mov_b32_e32 v49, s23
+; SI-NEXT: v_mov_b32_e32 v50, s24
+; SI-NEXT: v_mov_b32_e32 v51, s25
+; SI-NEXT: v_mov_b32_e32 v52, s26
+; SI-NEXT: v_mov_b32_e32 v53, s27
+; SI-NEXT: v_mov_b32_e32 v54, s28
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: v_mov_b32_e32 v55, s29
; SI-NEXT: s_cbranch_scc0 .LBB57_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_lshr_b32 s56, s5, 16
-; SI-NEXT: s_lshr_b32 s57, s29, 16
-; SI-NEXT: s_lshr_b32 s58, s27, 16
-; SI-NEXT: s_lshr_b32 s59, s25, 16
-; SI-NEXT: s_lshr_b32 s60, s23, 16
-; SI-NEXT: s_lshr_b32 s61, s21, 16
-; SI-NEXT: s_lshr_b32 s62, s19, 16
-; SI-NEXT: s_lshr_b32 s63, s17, 16
-; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[8:9], s[28:29], 16
-; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16
-; SI-NEXT: s_lshr_b64 s[12:13], s[24:25], 16
-; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 16
-; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 16
-; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 16
-; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35
+; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16
+; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16
+; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16
+; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16
+; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16
+; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16
+; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16
; SI-NEXT: s_cbranch_execnz .LBB57_3
; SI-NEXT: .LBB57_2: ; %cmp.true
-; SI-NEXT: s_add_u32 s4, s4, 3
-; SI-NEXT: s_addc_u32 s5, s5, 0
-; SI-NEXT: s_add_u32 s28, s28, 3
-; SI-NEXT: s_addc_u32 s29, s29, 0
-; SI-NEXT: s_add_u32 s26, s26, 3
-; SI-NEXT: s_addc_u32 s27, s27, 0
-; SI-NEXT: s_add_u32 s24, s24, 3
-; SI-NEXT: s_addc_u32 s25, s25, 0
-; SI-NEXT: s_add_u32 s22, s22, 3
-; SI-NEXT: s_addc_u32 s23, s23, 0
-; SI-NEXT: s_add_u32 s20, s20, 3
-; SI-NEXT: s_addc_u32 s21, s21, 0
-; SI-NEXT: s_add_u32 s18, s18, 3
-; SI-NEXT: s_addc_u32 s19, s19, 0
-; SI-NEXT: s_add_u32 s16, s16, 3
-; SI-NEXT: s_addc_u32 s17, s17, 0
-; SI-NEXT: s_lshr_b32 s56, s5, 16
-; SI-NEXT: s_lshr_b32 s57, s29, 16
-; SI-NEXT: s_lshr_b32 s58, s27, 16
-; SI-NEXT: s_lshr_b32 s59, s25, 16
-; SI-NEXT: s_lshr_b32 s60, s23, 16
-; SI-NEXT: s_lshr_b32 s61, s21, 16
-; SI-NEXT: s_lshr_b32 s62, s19, 16
-; SI-NEXT: s_lshr_b32 s63, s17, 16
-; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[8:9], s[28:29], 16
-; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16
-; SI-NEXT: s_lshr_b64 s[12:13], s[24:25], 16
-; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 16
-; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 16
-; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 16
-; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16
+; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
+; SI-NEXT: v_addc_u32_e32 v33, vcc, 0, v33, vcc
+; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54
+; SI-NEXT: v_addc_u32_e32 v55, vcc, 0, v55, vcc
+; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52
+; SI-NEXT: v_addc_u32_e32 v53, vcc, 0, v53, vcc
+; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50
+; SI-NEXT: v_addc_u32_e32 v51, vcc, 0, v51, vcc
+; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48
+; SI-NEXT: v_addc_u32_e32 v49, vcc, 0, v49, vcc
+; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38
+; SI-NEXT: v_addc_u32_e32 v39, vcc, 0, v39, vcc
+; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36
+; SI-NEXT: v_addc_u32_e32 v37, vcc, 0, v37, vcc
+; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34
+; SI-NEXT: v_addc_u32_e32 v35, vcc, 0, v35, vcc
+; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16
+; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16
+; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16
+; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16
+; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16
+; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16
+; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35
; SI-NEXT: .LBB57_3: ; %end
-; SI-NEXT: v_mov_b32_e32 v0, s16
-; SI-NEXT: v_mov_b32_e32 v1, s44
-; SI-NEXT: v_mov_b32_e32 v2, s17
-; SI-NEXT: v_mov_b32_e32 v3, s63
-; SI-NEXT: v_mov_b32_e32 v4, s18
-; SI-NEXT: v_mov_b32_e32 v5, s42
-; SI-NEXT: v_mov_b32_e32 v6, s19
-; SI-NEXT: v_mov_b32_e32 v7, s62
-; SI-NEXT: v_mov_b32_e32 v8, s20
-; SI-NEXT: v_mov_b32_e32 v9, s40
-; SI-NEXT: v_mov_b32_e32 v10, s21
-; SI-NEXT: v_mov_b32_e32 v11, s61
-; SI-NEXT: v_mov_b32_e32 v12, s22
-; SI-NEXT: v_mov_b32_e32 v13, s14
-; SI-NEXT: v_mov_b32_e32 v14, s23
-; SI-NEXT: v_mov_b32_e32 v15, s60
-; SI-NEXT: v_mov_b32_e32 v16, s24
-; SI-NEXT: v_mov_b32_e32 v17, s12
-; SI-NEXT: v_mov_b32_e32 v18, s25
-; SI-NEXT: v_mov_b32_e32 v19, s59
-; SI-NEXT: v_mov_b32_e32 v20, s26
-; SI-NEXT: v_mov_b32_e32 v21, s10
-; SI-NEXT: v_mov_b32_e32 v22, s27
-; SI-NEXT: v_mov_b32_e32 v23, s58
-; SI-NEXT: v_mov_b32_e32 v24, s28
-; SI-NEXT: v_mov_b32_e32 v25, s8
-; SI-NEXT: v_mov_b32_e32 v26, s29
-; SI-NEXT: v_mov_b32_e32 v27, s57
-; SI-NEXT: v_mov_b32_e32 v28, s4
-; SI-NEXT: v_mov_b32_e32 v29, s6
-; SI-NEXT: v_mov_b32_e32 v30, s5
-; SI-NEXT: v_mov_b32_e32 v31, s56
+; SI-NEXT: v_mov_b32_e32 v0, v34
+; SI-NEXT: v_mov_b32_e32 v2, v35
+; SI-NEXT: v_mov_b32_e32 v4, v36
+; SI-NEXT: v_mov_b32_e32 v6, v37
+; SI-NEXT: v_mov_b32_e32 v8, v38
+; SI-NEXT: v_mov_b32_e32 v10, v39
+; SI-NEXT: v_mov_b32_e32 v12, v48
+; SI-NEXT: v_mov_b32_e32 v14, v49
+; SI-NEXT: v_mov_b32_e32 v16, v50
+; SI-NEXT: v_mov_b32_e32 v18, v51
+; SI-NEXT: v_mov_b32_e32 v20, v52
+; SI-NEXT: v_mov_b32_e32 v22, v53
+; SI-NEXT: v_mov_b32_e32 v24, v54
+; SI-NEXT: v_mov_b32_e32 v26, v55
+; SI-NEXT: v_mov_b32_e32 v28, v32
+; SI-NEXT: v_mov_b32_e32 v30, v33
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB57_4:
-; SI-NEXT: ; implicit-def: $sgpr44
-; SI-NEXT: ; implicit-def: $sgpr63
-; SI-NEXT: ; implicit-def: $sgpr42
-; SI-NEXT: ; implicit-def: $sgpr62
-; SI-NEXT: ; implicit-def: $sgpr40
-; SI-NEXT: ; implicit-def: $sgpr61
-; SI-NEXT: ; implicit-def: $sgpr14
-; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr12
-; SI-NEXT: ; implicit-def: $sgpr59
-; SI-NEXT: ; implicit-def: $sgpr10
-; SI-NEXT: ; implicit-def: $sgpr58
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: ; implicit-def: $sgpr57
-; SI-NEXT: ; implicit-def: $sgpr6
-; SI-NEXT: ; implicit-def: $sgpr56
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: s_branch .LBB57_2
;
; VI-LABEL: bitcast_v8i64_to_v32i16_scalar:
@@ -32152,111 +32472,139 @@ define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32
; VI-LABEL: bitcast_v32i16_to_v8i64_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, s16
+; VI-NEXT: v_mov_b32_e32 v4, s17
+; VI-NEXT: v_mov_b32_e32 v5, s18
+; VI-NEXT: v_mov_b32_e32 v6, s19
+; VI-NEXT: v_mov_b32_e32 v7, s20
+; VI-NEXT: v_mov_b32_e32 v8, s21
+; VI-NEXT: v_mov_b32_e32 v9, s22
+; VI-NEXT: v_mov_b32_e32 v10, s23
+; VI-NEXT: v_mov_b32_e32 v11, s24
+; VI-NEXT: v_mov_b32_e32 v12, s25
+; VI-NEXT: v_mov_b32_e32 v13, s26
+; VI-NEXT: v_mov_b32_e32 v14, s27
+; VI-NEXT: v_mov_b32_e32 v15, s28
+; VI-NEXT: v_mov_b32_e32 v16, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: v_readfirstlane_b32 s6, v0
+; VI-NEXT: v_readfirstlane_b32 s6, v3
+; VI-NEXT: v_readfirstlane_b32 s7, v4
+; VI-NEXT: v_readfirstlane_b32 s8, v5
+; VI-NEXT: v_readfirstlane_b32 s9, v6
+; VI-NEXT: v_readfirstlane_b32 s10, v7
+; VI-NEXT: v_readfirstlane_b32 s11, v8
+; VI-NEXT: v_readfirstlane_b32 s12, v9
+; VI-NEXT: v_readfirstlane_b32 s13, v10
+; VI-NEXT: v_readfirstlane_b32 s14, v11
+; VI-NEXT: v_readfirstlane_b32 s15, v12
+; VI-NEXT: v_readfirstlane_b32 s16, v13
+; VI-NEXT: v_readfirstlane_b32 s17, v14
+; VI-NEXT: v_readfirstlane_b32 s18, v15
+; VI-NEXT: v_readfirstlane_b32 s19, v16
+; VI-NEXT: v_readfirstlane_b32 s20, v0
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_readfirstlane_b32 s7, v1
+; VI-NEXT: v_readfirstlane_b32 s21, v1
; VI-NEXT: s_cbranch_scc0 .LBB59_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB59_3
; VI-NEXT: .LBB59_2: ; %cmp.true
-; VI-NEXT: s_add_i32 s5, s7, 3
-; VI-NEXT: s_and_b32 s4, s7, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s21, 3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s6, 3
-; VI-NEXT: s_add_i32 s7, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s6, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s20, 3
+; VI-NEXT: s_add_i32 s21, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s29, 3
-; VI-NEXT: s_add_i32 s6, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s19, 3
+; VI-NEXT: s_add_i32 s20, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s28, 3
-; VI-NEXT: s_add_i32 s29, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s18, 3
+; VI-NEXT: s_add_i32 s19, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s27, 3
-; VI-NEXT: s_add_i32 s28, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s17, 3
+; VI-NEXT: s_add_i32 s18, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s26, 3
-; VI-NEXT: s_add_i32 s27, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s16, 3
+; VI-NEXT: s_add_i32 s17, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s25, 3
-; VI-NEXT: s_add_i32 s26, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s15, 3
+; VI-NEXT: s_add_i32 s16, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s15, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s24, 3
-; VI-NEXT: s_add_i32 s25, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s14, 3
+; VI-NEXT: s_add_i32 s15, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s14, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s23, 3
-; VI-NEXT: s_add_i32 s24, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s13, 3
+; VI-NEXT: s_add_i32 s14, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s13, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s22, 3
-; VI-NEXT: s_add_i32 s23, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s12, 3
+; VI-NEXT: s_add_i32 s13, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s12, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s21, 3
-; VI-NEXT: s_add_i32 s22, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s11, 3
+; VI-NEXT: s_add_i32 s12, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s11, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s20, 3
-; VI-NEXT: s_add_i32 s21, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s10, 3
+; VI-NEXT: s_add_i32 s11, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s10, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s19, 3
-; VI-NEXT: s_add_i32 s20, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s9, 3
+; VI-NEXT: s_add_i32 s10, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s9, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s18, 3
-; VI-NEXT: s_add_i32 s19, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s8, 3
+; VI-NEXT: s_add_i32 s9, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s8, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s17, 3
-; VI-NEXT: s_add_i32 s18, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s7, 3
+; VI-NEXT: s_add_i32 s8, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s7, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s16, 3
-; VI-NEXT: s_add_i32 s17, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s6, 3
+; VI-NEXT: s_add_i32 s7, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s6, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s16, s4, 0x30000
+; VI-NEXT: s_add_i32 s6, s4, 0x30000
; VI-NEXT: .LBB59_3: ; %end
-; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v1, s17
-; VI-NEXT: v_mov_b32_e32 v2, s18
-; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: v_mov_b32_e32 v4, s20
-; VI-NEXT: v_mov_b32_e32 v5, s21
-; VI-NEXT: v_mov_b32_e32 v6, s22
-; VI-NEXT: v_mov_b32_e32 v7, s23
-; VI-NEXT: v_mov_b32_e32 v8, s24
-; VI-NEXT: v_mov_b32_e32 v9, s25
-; VI-NEXT: v_mov_b32_e32 v10, s26
-; VI-NEXT: v_mov_b32_e32 v11, s27
-; VI-NEXT: v_mov_b32_e32 v12, s28
-; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: v_mov_b32_e32 v14, s6
-; VI-NEXT: v_mov_b32_e32 v15, s7
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: v_mov_b32_e32 v4, s10
+; VI-NEXT: v_mov_b32_e32 v5, s11
+; VI-NEXT: v_mov_b32_e32 v6, s12
+; VI-NEXT: v_mov_b32_e32 v7, s13
+; VI-NEXT: v_mov_b32_e32 v8, s14
+; VI-NEXT: v_mov_b32_e32 v9, s15
+; VI-NEXT: v_mov_b32_e32 v10, s16
+; VI-NEXT: v_mov_b32_e32 v11, s17
+; VI-NEXT: v_mov_b32_e32 v12, s18
+; VI-NEXT: v_mov_b32_e32 v13, s19
+; VI-NEXT: v_mov_b32_e32 v14, s20
+; VI-NEXT: v_mov_b32_e32 v15, s21
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB59_4:
; VI-NEXT: s_branch .LBB59_2
@@ -32677,108 +33025,136 @@ define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32
; SI-LABEL: bitcast_v8i64_to_v32f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, s16
+; SI-NEXT: v_mov_b32_e32 v4, s17
+; SI-NEXT: v_mov_b32_e32 v5, s18
+; SI-NEXT: v_mov_b32_e32 v6, s19
+; SI-NEXT: v_mov_b32_e32 v7, s20
+; SI-NEXT: v_mov_b32_e32 v8, s21
+; SI-NEXT: v_mov_b32_e32 v9, s22
+; SI-NEXT: v_mov_b32_e32 v10, s23
+; SI-NEXT: v_mov_b32_e32 v11, s24
+; SI-NEXT: v_mov_b32_e32 v12, s25
+; SI-NEXT: v_mov_b32_e32 v13, s26
+; SI-NEXT: v_mov_b32_e32 v14, s27
+; SI-NEXT: v_mov_b32_e32 v15, s28
+; SI-NEXT: v_mov_b32_e32 v16, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT: v_readfirstlane_b32 s20, v3
+; SI-NEXT: v_readfirstlane_b32 s21, v4
+; SI-NEXT: v_readfirstlane_b32 s18, v5
+; SI-NEXT: v_readfirstlane_b32 s19, v6
+; SI-NEXT: v_readfirstlane_b32 s16, v7
+; SI-NEXT: v_readfirstlane_b32 s17, v8
+; SI-NEXT: v_readfirstlane_b32 s14, v9
+; SI-NEXT: v_readfirstlane_b32 s15, v10
+; SI-NEXT: v_readfirstlane_b32 s12, v11
+; SI-NEXT: v_readfirstlane_b32 s13, v12
+; SI-NEXT: v_readfirstlane_b32 s10, v13
+; SI-NEXT: v_readfirstlane_b32 s11, v14
+; SI-NEXT: v_readfirstlane_b32 s7, v15
+; SI-NEXT: v_readfirstlane_b32 s8, v16
; SI-NEXT: v_readfirstlane_b32 s6, v0
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_readfirstlane_b32 s7, v1
+; SI-NEXT: v_readfirstlane_b32 s9, v1
; SI-NEXT: s_cbranch_scc0 .LBB61_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_lshr_b32 s4, s7, 16
+; SI-NEXT: s_lshr_b32 s4, s9, 16
; SI-NEXT: v_cvt_f32_f16_e32 v31, s4
; SI-NEXT: s_lshr_b32 s4, s6, 16
; SI-NEXT: v_cvt_f32_f16_e32 v29, s4
-; SI-NEXT: s_lshr_b32 s4, s29, 16
+; SI-NEXT: s_lshr_b32 s4, s8, 16
; SI-NEXT: v_cvt_f32_f16_e32 v27, s4
-; SI-NEXT: s_lshr_b32 s4, s28, 16
+; SI-NEXT: s_lshr_b32 s4, s7, 16
; SI-NEXT: v_cvt_f32_f16_e32 v25, s4
-; SI-NEXT: s_lshr_b32 s4, s27, 16
+; SI-NEXT: s_lshr_b32 s4, s11, 16
; SI-NEXT: v_cvt_f32_f16_e32 v23, s4
-; SI-NEXT: s_lshr_b32 s4, s26, 16
+; SI-NEXT: s_lshr_b32 s4, s10, 16
; SI-NEXT: v_cvt_f32_f16_e32 v21, s4
-; SI-NEXT: s_lshr_b32 s4, s25, 16
+; SI-NEXT: s_lshr_b32 s4, s13, 16
; SI-NEXT: v_cvt_f32_f16_e32 v19, s4
-; SI-NEXT: s_lshr_b32 s4, s24, 16
+; SI-NEXT: s_lshr_b32 s4, s12, 16
; SI-NEXT: v_cvt_f32_f16_e32 v17, s4
-; SI-NEXT: s_lshr_b32 s4, s23, 16
+; SI-NEXT: s_lshr_b32 s4, s15, 16
; SI-NEXT: v_cvt_f32_f16_e32 v15, s4
-; SI-NEXT: s_lshr_b32 s4, s22, 16
+; SI-NEXT: s_lshr_b32 s4, s14, 16
; SI-NEXT: v_cvt_f32_f16_e32 v13, s4
-; SI-NEXT: s_lshr_b32 s4, s21, 16
+; SI-NEXT: s_lshr_b32 s4, s17, 16
; SI-NEXT: v_cvt_f32_f16_e32 v11, s4
-; SI-NEXT: s_lshr_b32 s4, s20, 16
+; SI-NEXT: s_lshr_b32 s4, s16, 16
; SI-NEXT: v_cvt_f32_f16_e32 v9, s4
; SI-NEXT: s_lshr_b32 s4, s19, 16
; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
; SI-NEXT: s_lshr_b32 s4, s18, 16
; SI-NEXT: v_cvt_f32_f16_e32 v5, s4
-; SI-NEXT: s_lshr_b32 s4, s17, 16
+; SI-NEXT: s_lshr_b32 s4, s21, 16
; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
-; SI-NEXT: s_lshr_b32 s4, s16, 16
+; SI-NEXT: s_lshr_b32 s4, s20, 16
; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT: v_cvt_f32_f16_e32 v30, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v30, s9
; SI-NEXT: v_cvt_f32_f16_e32 v28, s6
-; SI-NEXT: v_cvt_f32_f16_e32 v26, s29
-; SI-NEXT: v_cvt_f32_f16_e32 v24, s28
-; SI-NEXT: v_cvt_f32_f16_e32 v22, s27
-; SI-NEXT: v_cvt_f32_f16_e32 v20, s26
-; SI-NEXT: v_cvt_f32_f16_e32 v18, s25
-; SI-NEXT: v_cvt_f32_f16_e32 v16, s24
-; SI-NEXT: v_cvt_f32_f16_e32 v14, s23
-; SI-NEXT: v_cvt_f32_f16_e32 v12, s22
-; SI-NEXT: v_cvt_f32_f16_e32 v10, s21
-; SI-NEXT: v_cvt_f32_f16_e32 v8, s20
+; SI-NEXT: v_cvt_f32_f16_e32 v26, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v24, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v22, s11
+; SI-NEXT: v_cvt_f32_f16_e32 v20, s10
+; SI-NEXT: v_cvt_f32_f16_e32 v18, s13
+; SI-NEXT: v_cvt_f32_f16_e32 v16, s12
+; SI-NEXT: v_cvt_f32_f16_e32 v14, s15
+; SI-NEXT: v_cvt_f32_f16_e32 v12, s14
+; SI-NEXT: v_cvt_f32_f16_e32 v10, s17
+; SI-NEXT: v_cvt_f32_f16_e32 v8, s16
; SI-NEXT: v_cvt_f32_f16_e32 v6, s19
; SI-NEXT: v_cvt_f32_f16_e32 v4, s18
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s17
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s16
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s21
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s20
; SI-NEXT: s_cbranch_execnz .LBB61_3
; SI-NEXT: .LBB61_2: ; %cmp.true
-; SI-NEXT: s_add_u32 s4, s16, 3
-; SI-NEXT: s_addc_u32 s5, s17, 0
-; SI-NEXT: s_lshr_b32 s8, s4, 16
-; SI-NEXT: s_lshr_b32 s9, s5, 16
-; SI-NEXT: s_add_u32 s10, s18, 3
-; SI-NEXT: s_addc_u32 s11, s19, 0
-; SI-NEXT: s_lshr_b32 s12, s10, 16
-; SI-NEXT: s_lshr_b32 s13, s11, 16
-; SI-NEXT: s_add_u32 s14, s20, 3
-; SI-NEXT: s_addc_u32 s15, s21, 0
-; SI-NEXT: s_lshr_b32 s16, s14, 16
-; SI-NEXT: s_lshr_b32 s17, s15, 16
-; SI-NEXT: s_add_u32 s18, s22, 3
-; SI-NEXT: s_addc_u32 s19, s23, 0
-; SI-NEXT: s_lshr_b32 s20, s18, 16
-; SI-NEXT: s_lshr_b32 s21, s19, 16
-; SI-NEXT: s_add_u32 s22, s24, 3
-; SI-NEXT: s_addc_u32 s23, s25, 0
-; SI-NEXT: s_lshr_b32 s24, s22, 16
-; SI-NEXT: s_lshr_b32 s25, s23, 16
-; SI-NEXT: s_add_u32 s26, s26, 3
-; SI-NEXT: s_addc_u32 s27, s27, 0
-; SI-NEXT: s_lshr_b32 s40, s26, 16
-; SI-NEXT: s_lshr_b32 s41, s27, 16
-; SI-NEXT: s_add_u32 s28, s28, 3
-; SI-NEXT: s_addc_u32 s29, s29, 0
-; SI-NEXT: s_lshr_b32 s42, s28, 16
-; SI-NEXT: s_lshr_b32 s43, s29, 16
+; SI-NEXT: s_add_u32 s4, s20, 3
+; SI-NEXT: s_addc_u32 s5, s21, 0
+; SI-NEXT: s_lshr_b32 s20, s4, 16
+; SI-NEXT: s_lshr_b32 s21, s5, 16
+; SI-NEXT: s_add_u32 s18, s18, 3
+; SI-NEXT: s_addc_u32 s19, s19, 0
+; SI-NEXT: s_lshr_b32 s22, s18, 16
+; SI-NEXT: s_lshr_b32 s23, s19, 16
+; SI-NEXT: s_add_u32 s16, s16, 3
+; SI-NEXT: s_addc_u32 s17, s17, 0
+; SI-NEXT: s_lshr_b32 s24, s16, 16
+; SI-NEXT: s_lshr_b32 s25, s17, 16
+; SI-NEXT: s_add_u32 s14, s14, 3
+; SI-NEXT: s_addc_u32 s15, s15, 0
+; SI-NEXT: s_lshr_b32 s26, s14, 16
+; SI-NEXT: s_lshr_b32 s27, s15, 16
+; SI-NEXT: s_add_u32 s12, s12, 3
+; SI-NEXT: s_addc_u32 s13, s13, 0
+; SI-NEXT: s_lshr_b32 s28, s12, 16
+; SI-NEXT: s_lshr_b32 s29, s13, 16
+; SI-NEXT: s_add_u32 s10, s10, 3
+; SI-NEXT: s_addc_u32 s11, s11, 0
+; SI-NEXT: s_lshr_b32 s40, s10, 16
+; SI-NEXT: s_lshr_b32 s41, s11, 16
+; SI-NEXT: s_add_u32 s7, s7, 3
+; SI-NEXT: s_addc_u32 s8, s8, 0
+; SI-NEXT: s_lshr_b32 s42, s7, 16
+; SI-NEXT: s_lshr_b32 s43, s8, 16
; SI-NEXT: s_add_u32 s6, s6, 3
-; SI-NEXT: s_addc_u32 s7, s7, 0
+; SI-NEXT: s_addc_u32 s9, s9, 0
; SI-NEXT: s_lshr_b32 s44, s6, 16
-; SI-NEXT: s_lshr_b32 s45, s7, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v30, s7
+; SI-NEXT: s_lshr_b32 s45, s9, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v30, s9
; SI-NEXT: v_cvt_f32_f16_e32 v28, s6
-; SI-NEXT: v_cvt_f32_f16_e32 v26, s29
-; SI-NEXT: v_cvt_f32_f16_e32 v24, s28
-; SI-NEXT: v_cvt_f32_f16_e32 v22, s27
-; SI-NEXT: v_cvt_f32_f16_e32 v20, s26
-; SI-NEXT: v_cvt_f32_f16_e32 v18, s23
-; SI-NEXT: v_cvt_f32_f16_e32 v16, s22
-; SI-NEXT: v_cvt_f32_f16_e32 v14, s19
-; SI-NEXT: v_cvt_f32_f16_e32 v12, s18
-; SI-NEXT: v_cvt_f32_f16_e32 v10, s15
-; SI-NEXT: v_cvt_f32_f16_e32 v8, s14
-; SI-NEXT: v_cvt_f32_f16_e32 v6, s11
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s10
+; SI-NEXT: v_cvt_f32_f16_e32 v26, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v24, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v22, s11
+; SI-NEXT: v_cvt_f32_f16_e32 v20, s10
+; SI-NEXT: v_cvt_f32_f16_e32 v18, s13
+; SI-NEXT: v_cvt_f32_f16_e32 v16, s12
+; SI-NEXT: v_cvt_f32_f16_e32 v14, s15
+; SI-NEXT: v_cvt_f32_f16_e32 v12, s14
+; SI-NEXT: v_cvt_f32_f16_e32 v10, s17
+; SI-NEXT: v_cvt_f32_f16_e32 v8, s16
+; SI-NEXT: v_cvt_f32_f16_e32 v6, s19
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s18
; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
; SI-NEXT: v_cvt_f32_f16_e32 v31, s45
@@ -32787,16 +33163,16 @@ define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32
; SI-NEXT: v_cvt_f32_f16_e32 v25, s42
; SI-NEXT: v_cvt_f32_f16_e32 v23, s41
; SI-NEXT: v_cvt_f32_f16_e32 v21, s40
-; SI-NEXT: v_cvt_f32_f16_e32 v19, s25
-; SI-NEXT: v_cvt_f32_f16_e32 v17, s24
-; SI-NEXT: v_cvt_f32_f16_e32 v15, s21
-; SI-NEXT: v_cvt_f32_f16_e32 v13, s20
-; SI-NEXT: v_cvt_f32_f16_e32 v11, s17
-; SI-NEXT: v_cvt_f32_f16_e32 v9, s16
-; SI-NEXT: v_cvt_f32_f16_e32 v7, s13
-; SI-NEXT: v_cvt_f32_f16_e32 v5, s12
-; SI-NEXT: v_cvt_f32_f16_e32 v3, s9
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v19, s29
+; SI-NEXT: v_cvt_f32_f16_e32 v17, s28
+; SI-NEXT: v_cvt_f32_f16_e32 v15, s27
+; SI-NEXT: v_cvt_f32_f16_e32 v13, s26
+; SI-NEXT: v_cvt_f32_f16_e32 v11, s25
+; SI-NEXT: v_cvt_f32_f16_e32 v9, s24
+; SI-NEXT: v_cvt_f32_f16_e32 v7, s23
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s22
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s21
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s20
; SI-NEXT: .LBB61_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB61_4:
@@ -34066,7 +34442,35 @@ define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a,
; SI-LABEL: bitcast_v8i64_to_v32bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, s16
+; SI-NEXT: v_mov_b32_e32 v4, s17
+; SI-NEXT: v_mov_b32_e32 v5, s18
+; SI-NEXT: v_mov_b32_e32 v6, s19
+; SI-NEXT: v_mov_b32_e32 v7, s20
+; SI-NEXT: v_mov_b32_e32 v8, s21
+; SI-NEXT: v_mov_b32_e32 v9, s22
+; SI-NEXT: v_mov_b32_e32 v10, s23
+; SI-NEXT: v_mov_b32_e32 v11, s24
+; SI-NEXT: v_mov_b32_e32 v12, s25
+; SI-NEXT: v_mov_b32_e32 v13, s26
+; SI-NEXT: v_mov_b32_e32 v14, s27
+; SI-NEXT: v_mov_b32_e32 v15, s28
+; SI-NEXT: v_mov_b32_e32 v16, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT: v_readfirstlane_b32 s56, v3
+; SI-NEXT: v_readfirstlane_b32 s57, v4
+; SI-NEXT: v_readfirstlane_b32 s58, v5
+; SI-NEXT: v_readfirstlane_b32 s59, v6
+; SI-NEXT: v_readfirstlane_b32 s60, v7
+; SI-NEXT: v_readfirstlane_b32 s61, v8
+; SI-NEXT: v_readfirstlane_b32 s62, v9
+; SI-NEXT: v_readfirstlane_b32 s63, v10
+; SI-NEXT: v_readfirstlane_b32 s72, v11
+; SI-NEXT: v_readfirstlane_b32 s73, v12
+; SI-NEXT: v_readfirstlane_b32 s74, v13
+; SI-NEXT: v_readfirstlane_b32 s75, v14
+; SI-NEXT: v_readfirstlane_b32 s76, v15
+; SI-NEXT: v_readfirstlane_b32 s77, v16
; SI-NEXT: v_readfirstlane_b32 s78, v0
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s79, v1
@@ -34076,50 +34480,50 @@ define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a,
; SI-NEXT: s_lshl_b32 s7, s79, 16
; SI-NEXT: s_and_b32 s8, s78, 0xffff0000
; SI-NEXT: s_lshl_b32 s9, s78, 16
-; SI-NEXT: s_and_b32 s10, s29, 0xffff0000
-; SI-NEXT: s_lshl_b32 s11, s29, 16
-; SI-NEXT: s_and_b32 s12, s28, 0xffff0000
-; SI-NEXT: s_lshl_b32 s13, s28, 16
-; SI-NEXT: s_and_b32 s14, s27, 0xffff0000
-; SI-NEXT: s_lshl_b32 s15, s27, 16
-; SI-NEXT: s_and_b32 s40, s26, 0xffff0000
-; SI-NEXT: s_lshl_b32 s41, s26, 16
-; SI-NEXT: s_and_b32 s42, s25, 0xffff0000
-; SI-NEXT: s_lshl_b32 s43, s25, 16
-; SI-NEXT: s_and_b32 s44, s24, 0xffff0000
-; SI-NEXT: s_lshl_b32 s45, s24, 16
-; SI-NEXT: s_and_b32 s46, s23, 0xffff0000
-; SI-NEXT: s_lshl_b32 s47, s23, 16
-; SI-NEXT: s_and_b32 s56, s22, 0xffff0000
-; SI-NEXT: s_lshl_b32 s57, s22, 16
-; SI-NEXT: s_and_b32 s58, s21, 0xffff0000
-; SI-NEXT: s_lshl_b32 s59, s21, 16
-; SI-NEXT: s_and_b32 s60, s20, 0xffff0000
-; SI-NEXT: s_lshl_b32 s61, s20, 16
-; SI-NEXT: s_and_b32 s62, s19, 0xffff0000
-; SI-NEXT: s_lshl_b32 s63, s19, 16
-; SI-NEXT: s_and_b32 s72, s18, 0xffff0000
-; SI-NEXT: s_lshl_b32 s73, s18, 16
-; SI-NEXT: s_and_b32 s74, s17, 0xffff0000
-; SI-NEXT: s_lshl_b32 s75, s17, 16
-; SI-NEXT: s_and_b32 s76, s16, 0xffff0000
-; SI-NEXT: s_lshl_b32 s77, s16, 16
+; SI-NEXT: s_and_b32 s10, s77, 0xffff0000
+; SI-NEXT: s_lshl_b32 s11, s77, 16
+; SI-NEXT: s_and_b32 s12, s76, 0xffff0000
+; SI-NEXT: s_lshl_b32 s13, s76, 16
+; SI-NEXT: s_and_b32 s14, s75, 0xffff0000
+; SI-NEXT: s_lshl_b32 s15, s75, 16
+; SI-NEXT: s_and_b32 s16, s74, 0xffff0000
+; SI-NEXT: s_lshl_b32 s17, s74, 16
+; SI-NEXT: s_and_b32 s18, s73, 0xffff0000
+; SI-NEXT: s_lshl_b32 s19, s73, 16
+; SI-NEXT: s_and_b32 s20, s72, 0xffff0000
+; SI-NEXT: s_lshl_b32 s21, s72, 16
+; SI-NEXT: s_and_b32 s22, s63, 0xffff0000
+; SI-NEXT: s_lshl_b32 s23, s63, 16
+; SI-NEXT: s_and_b32 s24, s62, 0xffff0000
+; SI-NEXT: s_lshl_b32 s25, s62, 16
+; SI-NEXT: s_and_b32 s26, s61, 0xffff0000
+; SI-NEXT: s_lshl_b32 s27, s61, 16
+; SI-NEXT: s_and_b32 s28, s60, 0xffff0000
+; SI-NEXT: s_lshl_b32 s29, s60, 16
+; SI-NEXT: s_and_b32 s40, s59, 0xffff0000
+; SI-NEXT: s_lshl_b32 s41, s59, 16
+; SI-NEXT: s_and_b32 s42, s58, 0xffff0000
+; SI-NEXT: s_lshl_b32 s43, s58, 16
+; SI-NEXT: s_and_b32 s44, s57, 0xffff0000
+; SI-NEXT: s_lshl_b32 s45, s57, 16
+; SI-NEXT: s_and_b32 s46, s56, 0xffff0000
+; SI-NEXT: s_lshl_b32 s47, s56, 16
; SI-NEXT: s_cbranch_execnz .LBB65_3
; SI-NEXT: .LBB65_2: ; %cmp.true
-; SI-NEXT: s_add_u32 s4, s16, 3
-; SI-NEXT: s_addc_u32 s5, s17, 0
-; SI-NEXT: s_add_u32 s16, s18, 3
-; SI-NEXT: s_addc_u32 s17, s19, 0
-; SI-NEXT: s_add_u32 s18, s20, 3
-; SI-NEXT: s_addc_u32 s19, s21, 0
-; SI-NEXT: s_add_u32 s20, s22, 3
-; SI-NEXT: s_addc_u32 s21, s23, 0
-; SI-NEXT: s_add_u32 s22, s24, 3
-; SI-NEXT: s_addc_u32 s23, s25, 0
-; SI-NEXT: s_add_u32 s24, s26, 3
-; SI-NEXT: s_addc_u32 s15, s27, 0
-; SI-NEXT: s_add_u32 s13, s28, 3
-; SI-NEXT: s_addc_u32 s11, s29, 0
+; SI-NEXT: s_add_u32 s4, s56, 3
+; SI-NEXT: s_addc_u32 s5, s57, 0
+; SI-NEXT: s_add_u32 s43, s58, 3
+; SI-NEXT: s_addc_u32 s41, s59, 0
+; SI-NEXT: s_add_u32 s29, s60, 3
+; SI-NEXT: s_addc_u32 s27, s61, 0
+; SI-NEXT: s_add_u32 s25, s62, 3
+; SI-NEXT: s_addc_u32 s23, s63, 0
+; SI-NEXT: s_add_u32 s21, s72, 3
+; SI-NEXT: s_addc_u32 s19, s73, 0
+; SI-NEXT: s_add_u32 s17, s74, 3
+; SI-NEXT: s_addc_u32 s15, s75, 0
+; SI-NEXT: s_add_u32 s13, s76, 3
+; SI-NEXT: s_addc_u32 s11, s77, 0
; SI-NEXT: s_add_u32 s9, s78, 3
; SI-NEXT: s_addc_u32 s7, s79, 0
; SI-NEXT: s_and_b32 s6, s7, 0xffff0000
@@ -34132,51 +34536,51 @@ define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a,
; SI-NEXT: s_lshl_b32 s13, s13, 16
; SI-NEXT: s_and_b32 s14, s15, 0xffff0000
; SI-NEXT: s_lshl_b32 s15, s15, 16
-; SI-NEXT: s_and_b32 s40, s24, 0xffff0000
-; SI-NEXT: s_lshl_b32 s41, s24, 16
-; SI-NEXT: s_and_b32 s42, s23, 0xffff0000
-; SI-NEXT: s_lshl_b32 s43, s23, 16
-; SI-NEXT: s_and_b32 s44, s22, 0xffff0000
-; SI-NEXT: s_lshl_b32 s45, s22, 16
-; SI-NEXT: s_and_b32 s46, s21, 0xffff0000
-; SI-NEXT: s_lshl_b32 s47, s21, 16
-; SI-NEXT: s_and_b32 s56, s20, 0xffff0000
-; SI-NEXT: s_lshl_b32 s57, s20, 16
-; SI-NEXT: s_and_b32 s58, s19, 0xffff0000
-; SI-NEXT: s_lshl_b32 s59, s19, 16
-; SI-NEXT: s_and_b32 s60, s18, 0xffff0000
-; SI-NEXT: s_lshl_b32 s61, s18, 16
-; SI-NEXT: s_and_b32 s62, s17, 0xffff0000
-; SI-NEXT: s_lshl_b32 s63, s17, 16
-; SI-NEXT: s_and_b32 s72, s16, 0xffff0000
-; SI-NEXT: s_lshl_b32 s73, s16, 16
-; SI-NEXT: s_and_b32 s74, s5, 0xffff0000
-; SI-NEXT: s_lshl_b32 s75, s5, 16
-; SI-NEXT: s_and_b32 s76, s4, 0xffff0000
-; SI-NEXT: s_lshl_b32 s77, s4, 16
+; SI-NEXT: s_and_b32 s16, s17, 0xffff0000
+; SI-NEXT: s_lshl_b32 s17, s17, 16
+; SI-NEXT: s_and_b32 s18, s19, 0xffff0000
+; SI-NEXT: s_lshl_b32 s19, s19, 16
+; SI-NEXT: s_and_b32 s20, s21, 0xffff0000
+; SI-NEXT: s_lshl_b32 s21, s21, 16
+; SI-NEXT: s_and_b32 s22, s23, 0xffff0000
+; SI-NEXT: s_lshl_b32 s23, s23, 16
+; SI-NEXT: s_and_b32 s24, s25, 0xffff0000
+; SI-NEXT: s_lshl_b32 s25, s25, 16
+; SI-NEXT: s_and_b32 s26, s27, 0xffff0000
+; SI-NEXT: s_lshl_b32 s27, s27, 16
+; SI-NEXT: s_and_b32 s28, s29, 0xffff0000
+; SI-NEXT: s_lshl_b32 s29, s29, 16
+; SI-NEXT: s_and_b32 s40, s41, 0xffff0000
+; SI-NEXT: s_lshl_b32 s41, s41, 16
+; SI-NEXT: s_and_b32 s42, s43, 0xffff0000
+; SI-NEXT: s_lshl_b32 s43, s43, 16
+; SI-NEXT: s_and_b32 s44, s5, 0xffff0000
+; SI-NEXT: s_lshl_b32 s45, s5, 16
+; SI-NEXT: s_and_b32 s46, s4, 0xffff0000
+; SI-NEXT: s_lshl_b32 s47, s4, 16
; SI-NEXT: .LBB65_3: ; %end
-; SI-NEXT: v_mov_b32_e32 v0, s77
-; SI-NEXT: v_mov_b32_e32 v1, s76
-; SI-NEXT: v_mov_b32_e32 v2, s75
-; SI-NEXT: v_mov_b32_e32 v3, s74
-; SI-NEXT: v_mov_b32_e32 v4, s73
-; SI-NEXT: v_mov_b32_e32 v5, s72
-; SI-NEXT: v_mov_b32_e32 v6, s63
-; SI-NEXT: v_mov_b32_e32 v7, s62
-; SI-NEXT: v_mov_b32_e32 v8, s61
-; SI-NEXT: v_mov_b32_e32 v9, s60
-; SI-NEXT: v_mov_b32_e32 v10, s59
-; SI-NEXT: v_mov_b32_e32 v11, s58
-; SI-NEXT: v_mov_b32_e32 v12, s57
-; SI-NEXT: v_mov_b32_e32 v13, s56
-; SI-NEXT: v_mov_b32_e32 v14, s47
-; SI-NEXT: v_mov_b32_e32 v15, s46
-; SI-NEXT: v_mov_b32_e32 v16, s45
-; SI-NEXT: v_mov_b32_e32 v17, s44
-; SI-NEXT: v_mov_b32_e32 v18, s43
-; SI-NEXT: v_mov_b32_e32 v19, s42
-; SI-NEXT: v_mov_b32_e32 v20, s41
-; SI-NEXT: v_mov_b32_e32 v21, s40
+; SI-NEXT: v_mov_b32_e32 v0, s47
+; SI-NEXT: v_mov_b32_e32 v1, s46
+; SI-NEXT: v_mov_b32_e32 v2, s45
+; SI-NEXT: v_mov_b32_e32 v3, s44
+; SI-NEXT: v_mov_b32_e32 v4, s43
+; SI-NEXT: v_mov_b32_e32 v5, s42
+; SI-NEXT: v_mov_b32_e32 v6, s41
+; SI-NEXT: v_mov_b32_e32 v7, s40
+; SI-NEXT: v_mov_b32_e32 v8, s29
+; SI-NEXT: v_mov_b32_e32 v9, s28
+; SI-NEXT: v_mov_b32_e32 v10, s27
+; SI-NEXT: v_mov_b32_e32 v11, s26
+; SI-NEXT: v_mov_b32_e32 v12, s25
+; SI-NEXT: v_mov_b32_e32 v13, s24
+; SI-NEXT: v_mov_b32_e32 v14, s23
+; SI-NEXT: v_mov_b32_e32 v15, s22
+; SI-NEXT: v_mov_b32_e32 v16, s21
+; SI-NEXT: v_mov_b32_e32 v17, s20
+; SI-NEXT: v_mov_b32_e32 v18, s19
+; SI-NEXT: v_mov_b32_e32 v19, s18
+; SI-NEXT: v_mov_b32_e32 v20, s17
+; SI-NEXT: v_mov_b32_e32 v21, s16
; SI-NEXT: v_mov_b32_e32 v22, s15
; SI-NEXT: v_mov_b32_e32 v23, s14
; SI-NEXT: v_mov_b32_e32 v24, s13
@@ -34189,20 +34593,6 @@ define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a,
; SI-NEXT: v_mov_b32_e32 v31, s6
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB65_4:
-; SI-NEXT: ; implicit-def: $sgpr77
-; SI-NEXT: ; implicit-def: $sgpr76
-; SI-NEXT: ; implicit-def: $sgpr75
-; SI-NEXT: ; implicit-def: $sgpr74
-; SI-NEXT: ; implicit-def: $sgpr73
-; SI-NEXT: ; implicit-def: $sgpr72
-; SI-NEXT: ; implicit-def: $sgpr63
-; SI-NEXT: ; implicit-def: $sgpr62
-; SI-NEXT: ; implicit-def: $sgpr61
-; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr59
-; SI-NEXT: ; implicit-def: $sgpr58
-; SI-NEXT: ; implicit-def: $sgpr57
-; SI-NEXT: ; implicit-def: $sgpr56
; SI-NEXT: ; implicit-def: $sgpr47
; SI-NEXT: ; implicit-def: $sgpr46
; SI-NEXT: ; implicit-def: $sgpr45
@@ -34211,6 +34601,20 @@ define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a,
; SI-NEXT: ; implicit-def: $sgpr42
; SI-NEXT: ; implicit-def: $sgpr41
; SI-NEXT: ; implicit-def: $sgpr40
+; SI-NEXT: ; implicit-def: $sgpr29
+; SI-NEXT: ; implicit-def: $sgpr28
+; SI-NEXT: ; implicit-def: $sgpr27
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr25
+; SI-NEXT: ; implicit-def: $sgpr24
+; SI-NEXT: ; implicit-def: $sgpr23
+; SI-NEXT: ; implicit-def: $sgpr22
+; SI-NEXT: ; implicit-def: $sgpr21
+; SI-NEXT: ; implicit-def: $sgpr20
+; SI-NEXT: ; implicit-def: $sgpr19
+; SI-NEXT: ; implicit-def: $sgpr18
+; SI-NEXT: ; implicit-def: $sgpr17
+; SI-NEXT: ; implicit-def: $sgpr16
; SI-NEXT: ; implicit-def: $sgpr15
; SI-NEXT: ; implicit-def: $sgpr14
; SI-NEXT: ; implicit-def: $sgpr13
@@ -35773,172 +36177,209 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mul_f32_e64 v62, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v45, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v44, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v42, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17
+; SI-NEXT: v_mul_f32_e64 v41, 1.0, s21
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e64 v63, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29
; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9
; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13
; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15
; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v16
+; SI-NEXT: v_mul_f32_e64 v39, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v37, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s28
; SI-NEXT: s_cbranch_scc0 .LBB67_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17
-; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16
-; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16
-; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16
-; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16
-; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16
-; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16
-; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v62
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60
+; SI-NEXT: v_lshr_b64 v[0:1], v[39:40], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16
+; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v41
+; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63
+; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61
+; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v59
+; SI-NEXT: v_lshr_b64 v[5:6], v[37:38], 16
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58
+; SI-NEXT: v_lshr_b64 v[6:7], v[35:36], 16
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v57
+; SI-NEXT: v_lshr_b64 v[7:8], v[33:34], 16
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v56
+; SI-NEXT: v_lshr_b64 v[8:9], v[31:32], 16
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v47
+; SI-NEXT: v_lshr_b64 v[9:10], v[29:30], 16
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v46
+; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v45
+; SI-NEXT: v_lshr_b64 v[11:12], v[25:26], 16
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v44
+; SI-NEXT: v_lshr_b64 v[12:13], v[23:24], 16
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43
+; SI-NEXT: v_lshr_b64 v[13:14], v[21:22], 16
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42
+; SI-NEXT: v_lshr_b64 v[14:15], v[19:20], 16
+; SI-NEXT: v_mov_b32_e32 v20, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16
+; SI-NEXT: v_mov_b32_e32 v18, v20
; SI-NEXT: s_cbranch_execnz .LBB67_3
; SI-NEXT: .LBB67_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v63
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v56
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v47
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
+; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
+; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v43
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
+; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v42
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
+; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16
; SI-NEXT: .LBB67_3: ; %end
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB67_4:
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -35947,687 +36388,665 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-LABEL: bitcast_v32bf16_to_v8i64_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v19, s30, 0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: v_writelane_b32 v19, s31, 1
-; VI-NEXT: v_readfirstlane_b32 s30, v0
+; VI-NEXT: v_mov_b32_e32 v10, v2
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; VI-NEXT: v_mov_b32_e32 v15, v1
+; VI-NEXT: v_mov_b32_e32 v14, v0
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v1, s17
+; VI-NEXT: v_mov_b32_e32 v2, s18
+; VI-NEXT: v_mov_b32_e32 v3, s19
+; VI-NEXT: v_mov_b32_e32 v4, s20
+; VI-NEXT: v_mov_b32_e32 v5, s21
+; VI-NEXT: v_mov_b32_e32 v6, s22
+; VI-NEXT: v_mov_b32_e32 v7, s23
+; VI-NEXT: v_mov_b32_e32 v8, s24
+; VI-NEXT: v_mov_b32_e32 v9, s25
+; VI-NEXT: v_mov_b32_e32 v11, s27
+; VI-NEXT: v_mov_b32_e32 v13, s29
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_readfirstlane_b32 s31, v1
-; VI-NEXT: s_cbranch_scc0 .LBB67_3
+; VI-NEXT: v_mov_b32_e32 v10, s26
+; VI-NEXT: v_mov_b32_e32 v12, s28
+; VI-NEXT: s_cbranch_scc0 .LBB67_4
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB67_4
+; VI-NEXT: s_cbranch_execnz .LBB67_3
; VI-NEXT: .LBB67_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s31, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_lshl_b32 s4, s30, 16
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v0
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s4, v0
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17]
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_bfe_u32 v15, v14, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v14
+; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[17:18]
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_bfe_u32 v13, v12, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12
+; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[18:19]
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_bfe_u32 v11, v10, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v10
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[19:20]
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20]
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v8
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[20:21]
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21]
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_bfe_u32 v7, v6, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[21:22]
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; VI-NEXT: v_lshrrev_b64 v[21:22], 16, v[21:22]
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16
-; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v4
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[22:23]
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23]
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v2
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[23:24]
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v16, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v16, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1
+; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24]
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
-; VI-NEXT: v_add_f32_e32 v16, s4, v0
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16
-; VI-NEXT: v_add_f32_e32 v16, s4, v0
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_bfe_u32 v17, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v24, v3, v5, vcc
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16
-; VI-NEXT: s_branch .LBB67_5
-; VI-NEXT: .LBB67_3:
-; VI-NEXT: s_branch .LBB67_2
-; VI-NEXT: .LBB67_4:
-; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v1, s17
-; VI-NEXT: v_mov_b32_e32 v2, s18
-; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: v_mov_b32_e32 v4, s20
-; VI-NEXT: v_mov_b32_e32 v5, s21
-; VI-NEXT: v_mov_b32_e32 v6, s22
-; VI-NEXT: v_mov_b32_e32 v7, s23
-; VI-NEXT: v_mov_b32_e32 v8, s24
-; VI-NEXT: v_mov_b32_e32 v9, s25
-; VI-NEXT: v_mov_b32_e32 v10, s26
-; VI-NEXT: v_mov_b32_e32 v11, s27
-; VI-NEXT: v_mov_b32_e32 v12, s28
-; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: v_mov_b32_e32 v14, s30
-; VI-NEXT: v_mov_b32_e32 v15, s31
-; VI-NEXT: .LBB67_5: ; %end
-; VI-NEXT: v_readlane_b32 s31, v19, 1
-; VI-NEXT: v_readlane_b32 s30, v19, 0
-; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[24:25]
+; VI-NEXT: v_mov_b32_e32 v1, v23
+; VI-NEXT: v_mov_b32_e32 v3, v22
+; VI-NEXT: v_mov_b32_e32 v5, v21
+; VI-NEXT: v_mov_b32_e32 v7, v20
+; VI-NEXT: v_mov_b32_e32 v9, v19
+; VI-NEXT: v_mov_b32_e32 v11, v18
+; VI-NEXT: v_mov_b32_e32 v13, v17
+; VI-NEXT: v_mov_b32_e32 v15, v16
+; VI-NEXT: .LBB67_3: ; %end
; VI-NEXT: s_setpc_b64 s[30:31]
+; VI-NEXT: .LBB67_4:
+; VI-NEXT: s_branch .LBB67_2
;
; GFX9-LABEL: bitcast_v32bf16_to_v8i64_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v20, s30, 0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT: v_writelane_b32 v20, s31, 1
-; GFX9-NEXT: v_readfirstlane_b32 s30, v0
+; GFX9-NEXT: v_mov_b32_e32 v13, v2
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; GFX9-NEXT: v_mov_b32_e32 v15, v1
+; GFX9-NEXT: v_mov_b32_e32 v14, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s16
+; GFX9-NEXT: v_mov_b32_e32 v1, s17
+; GFX9-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-NEXT: v_mov_b32_e32 v3, s19
+; GFX9-NEXT: v_mov_b32_e32 v4, s20
+; GFX9-NEXT: v_mov_b32_e32 v5, s21
+; GFX9-NEXT: v_mov_b32_e32 v6, s22
+; GFX9-NEXT: v_mov_b32_e32 v7, s23
+; GFX9-NEXT: v_mov_b32_e32 v8, s24
+; GFX9-NEXT: v_mov_b32_e32 v9, s25
+; GFX9-NEXT: v_mov_b32_e32 v10, s26
+; GFX9-NEXT: v_mov_b32_e32 v11, s27
+; GFX9-NEXT: v_mov_b32_e32 v12, s28
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT: v_readfirstlane_b32 s31, v1
-; GFX9-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX9-NEXT: v_mov_b32_e32 v13, s29
+; GFX9-NEXT: s_cbranch_scc0 .LBB67_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB67_4
+; GFX9-NEXT: s_cbranch_execnz .LBB67_3
; GFX9-NEXT: .LBB67_2: ; %cmp.true
-; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s31, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add_f32_e32 v3, s4, v0
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
-; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: s_lshl_b32 s4, s30, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_add_f32_e32 v4, s4, v0
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v5, v5, v4
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v17, v17, v16
+; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v16, v16, v15
+; GFX9-NEXT: v_add_u32_e32 v16, 0x7fff, v16
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc
; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff
-; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX9-NEXT: v_and_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v15, v17, 16, v15
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v14, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v14
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v14
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v14, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v14, v17, 16, v14
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v13
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v13, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v13
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v13, v17, 16, v13
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v12
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v12
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v12
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v12, v17, 16, v12
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v11
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v11
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v11
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v11, v17, 16, v11
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v10
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v10, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v10
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v10
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v10, v17, 16, v10
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v9
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v9, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v9
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v9, v17, 16, v9
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v8
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v8, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v8
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v8, v17, 16, v8
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v7, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v7
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v7, v17, 16, v7
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v6
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v5, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v5
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v5
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v5, v17, 16, v5
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v4
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v4, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v4
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s29, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s28, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s27, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s26, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s25, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s24, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s23, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s22, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s21, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s20, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s19, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s18, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v17, v17, v2
-; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v4, v17, 16, v4
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v3, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v3
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v3
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v2
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v2
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v17, v17, v1
-; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s17, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc
-; GFX9-NEXT: v_add_f32_e32 v17, s4, v0
+; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17
-; GFX9-NEXT: v_add_f32_e32 v17, s4, v0
+; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v1
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
-; GFX9-NEXT: s_lshl_b32 s4, s16, 16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_add_f32_e32 v0, s4, v0
+; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1
; GFX9-NEXT: v_add_u32_e32 v18, v18, v0
; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16
-; GFX9-NEXT: s_branch .LBB67_5
-; GFX9-NEXT: .LBB67_3:
-; GFX9-NEXT: s_branch .LBB67_2
-; GFX9-NEXT: .LBB67_4:
-; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: v_mov_b32_e32 v1, s17
-; GFX9-NEXT: v_mov_b32_e32 v2, s18
-; GFX9-NEXT: v_mov_b32_e32 v3, s19
-; GFX9-NEXT: v_mov_b32_e32 v4, s20
-; GFX9-NEXT: v_mov_b32_e32 v5, s21
-; GFX9-NEXT: v_mov_b32_e32 v6, s22
-; GFX9-NEXT: v_mov_b32_e32 v7, s23
-; GFX9-NEXT: v_mov_b32_e32 v8, s24
-; GFX9-NEXT: v_mov_b32_e32 v9, s25
-; GFX9-NEXT: v_mov_b32_e32 v10, s26
-; GFX9-NEXT: v_mov_b32_e32 v11, s27
-; GFX9-NEXT: v_mov_b32_e32 v12, s28
-; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: v_mov_b32_e32 v14, s30
-; GFX9-NEXT: v_mov_b32_e32 v15, s31
-; GFX9-NEXT: .LBB67_5: ; %end
-; GFX9-NEXT: v_readlane_b32 s31, v20, 1
-; GFX9-NEXT: v_readlane_b32 s30, v20, 0
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v0, v17, 16, v0
+; GFX9-NEXT: .LBB67_3: ; %end
; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: .LBB67_4:
+; GFX9-NEXT: s_branch .LBB67_2
;
; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v8i64_scalar:
; GFX11-TRUE16: ; %bb.0:
@@ -37702,22 +38121,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v14
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v33
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -37753,6 +38156,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -38035,22 +38454,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18
@@ -38084,6 +38487,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -38355,25 +38774,9 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -38400,6 +38803,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -38992,345 +39411,373 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v4, s30, 0
-; SI-NEXT: v_writelane_b32 v4, s31, 1
-; SI-NEXT: v_writelane_b32 v4, s34, 2
-; SI-NEXT: v_writelane_b32 v4, s35, 3
-; SI-NEXT: v_writelane_b32 v4, s36, 4
-; SI-NEXT: v_writelane_b32 v4, s37, 5
-; SI-NEXT: v_writelane_b32 v4, s38, 6
-; SI-NEXT: v_writelane_b32 v4, s39, 7
-; SI-NEXT: v_writelane_b32 v4, s48, 8
-; SI-NEXT: v_writelane_b32 v4, s49, 9
-; SI-NEXT: v_writelane_b32 v4, s50, 10
-; SI-NEXT: v_writelane_b32 v4, s51, 11
-; SI-NEXT: v_writelane_b32 v4, s52, 12
-; SI-NEXT: v_writelane_b32 v4, s53, 13
-; SI-NEXT: v_writelane_b32 v4, s54, 14
-; SI-NEXT: v_writelane_b32 v4, s55, 15
-; SI-NEXT: v_writelane_b32 v4, s64, 16
-; SI-NEXT: v_writelane_b32 v4, s65, 17
-; SI-NEXT: v_writelane_b32 v4, s66, 18
-; SI-NEXT: v_writelane_b32 v4, s67, 19
-; SI-NEXT: v_writelane_b32 v4, s68, 20
-; SI-NEXT: v_writelane_b32 v4, s69, 21
-; SI-NEXT: v_writelane_b32 v4, s70, 22
-; SI-NEXT: v_writelane_b32 v4, s71, 23
-; SI-NEXT: v_writelane_b32 v4, s80, 24
-; SI-NEXT: v_writelane_b32 v4, s81, 25
-; SI-NEXT: v_writelane_b32 v4, s82, 26
-; SI-NEXT: v_writelane_b32 v4, s83, 27
+; SI-NEXT: v_writelane_b32 v18, s30, 0
+; SI-NEXT: v_writelane_b32 v18, s31, 1
+; SI-NEXT: v_writelane_b32 v18, s34, 2
+; SI-NEXT: v_writelane_b32 v18, s35, 3
+; SI-NEXT: v_writelane_b32 v18, s36, 4
+; SI-NEXT: v_writelane_b32 v18, s37, 5
+; SI-NEXT: v_writelane_b32 v18, s38, 6
+; SI-NEXT: v_writelane_b32 v18, s39, 7
+; SI-NEXT: v_writelane_b32 v18, s48, 8
+; SI-NEXT: v_writelane_b32 v18, s49, 9
+; SI-NEXT: v_writelane_b32 v18, s50, 10
+; SI-NEXT: v_writelane_b32 v18, s51, 11
+; SI-NEXT: v_writelane_b32 v18, s52, 12
+; SI-NEXT: v_writelane_b32 v18, s53, 13
+; SI-NEXT: v_writelane_b32 v18, s54, 14
+; SI-NEXT: v_writelane_b32 v18, s55, 15
+; SI-NEXT: v_writelane_b32 v18, s64, 16
+; SI-NEXT: v_writelane_b32 v18, s65, 17
+; SI-NEXT: v_writelane_b32 v18, s66, 18
+; SI-NEXT: v_writelane_b32 v18, s67, 19
+; SI-NEXT: v_writelane_b32 v18, s68, 20
+; SI-NEXT: v_writelane_b32 v18, s69, 21
+; SI-NEXT: v_writelane_b32 v18, s70, 22
+; SI-NEXT: v_writelane_b32 v18, s71, 23
+; SI-NEXT: v_writelane_b32 v18, s80, 24
+; SI-NEXT: v_writelane_b32 v18, s81, 25
+; SI-NEXT: v_writelane_b32 v18, s82, 26
+; SI-NEXT: v_writelane_b32 v18, s83, 27
+; SI-NEXT: v_mov_b32_e32 v4, s16
+; SI-NEXT: v_mov_b32_e32 v5, s17
+; SI-NEXT: v_mov_b32_e32 v6, s18
+; SI-NEXT: v_mov_b32_e32 v7, s19
+; SI-NEXT: v_mov_b32_e32 v8, s20
+; SI-NEXT: v_mov_b32_e32 v9, s21
+; SI-NEXT: v_mov_b32_e32 v10, s22
+; SI-NEXT: v_mov_b32_e32 v11, s23
+; SI-NEXT: v_mov_b32_e32 v12, s24
+; SI-NEXT: v_mov_b32_e32 v13, s25
+; SI-NEXT: v_mov_b32_e32 v14, s26
+; SI-NEXT: v_mov_b32_e32 v15, s27
+; SI-NEXT: v_mov_b32_e32 v16, s28
+; SI-NEXT: v_mov_b32_e32 v17, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT: v_writelane_b32 v4, s84, 28
+; SI-NEXT: v_writelane_b32 v18, s84, 28
+; SI-NEXT: v_readfirstlane_b32 s18, v4
+; SI-NEXT: v_readfirstlane_b32 s19, v5
+; SI-NEXT: v_readfirstlane_b32 s16, v6
+; SI-NEXT: v_readfirstlane_b32 s17, v7
+; SI-NEXT: v_readfirstlane_b32 s14, v8
+; SI-NEXT: v_readfirstlane_b32 s15, v9
+; SI-NEXT: v_readfirstlane_b32 s12, v10
+; SI-NEXT: v_readfirstlane_b32 s13, v11
+; SI-NEXT: v_readfirstlane_b32 s10, v12
+; SI-NEXT: v_readfirstlane_b32 s11, v13
+; SI-NEXT: v_readfirstlane_b32 s8, v14
+; SI-NEXT: v_readfirstlane_b32 s9, v15
+; SI-NEXT: v_readfirstlane_b32 s6, v16
+; SI-NEXT: v_readfirstlane_b32 s7, v17
; SI-NEXT: v_readfirstlane_b32 s4, v1
-; SI-NEXT: s_and_b64 s[6:7], vcc, exec
+; SI-NEXT: s_and_b64 s[20:21], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v2
-; SI-NEXT: v_writelane_b32 v4, s85, 29
+; SI-NEXT: v_writelane_b32 v18, s85, 29
; SI-NEXT: s_cbranch_scc0 .LBB69_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s38, s5, 24
; SI-NEXT: s_lshr_b32 s39, s5, 16
; SI-NEXT: s_lshr_b32 s48, s5, 8
-; SI-NEXT: s_lshr_b32 s49, s29, 24
-; SI-NEXT: s_lshr_b32 s50, s29, 16
-; SI-NEXT: s_lshr_b32 s51, s29, 8
-; SI-NEXT: s_lshr_b32 s52, s27, 24
-; SI-NEXT: s_lshr_b32 s53, s27, 16
-; SI-NEXT: s_lshr_b32 s54, s27, 8
-; SI-NEXT: s_lshr_b32 s55, s25, 24
-; SI-NEXT: s_lshr_b32 s64, s25, 16
-; SI-NEXT: s_lshr_b32 s65, s25, 8
-; SI-NEXT: s_lshr_b32 s66, s23, 24
-; SI-NEXT: s_lshr_b32 s67, s23, 16
-; SI-NEXT: s_lshr_b32 s68, s23, 8
-; SI-NEXT: s_lshr_b32 s69, s21, 24
-; SI-NEXT: s_lshr_b32 s70, s21, 16
-; SI-NEXT: s_lshr_b32 s71, s21, 8
-; SI-NEXT: s_lshr_b32 s80, s19, 24
-; SI-NEXT: s_lshr_b32 s81, s19, 16
-; SI-NEXT: s_lshr_b32 s82, s19, 8
-; SI-NEXT: s_lshr_b32 s83, s17, 24
-; SI-NEXT: s_lshr_b32 s84, s17, 16
-; SI-NEXT: s_lshr_b32 s85, s17, 8
-; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8
-; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 24
-; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16
-; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 8
-; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 24
-; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 16
-; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8
-; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 24
-; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 16
-; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 8
-; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 24
-; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16
-; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 8
-; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 24
-; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16
-; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 8
-; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24
-; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 16
-; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 8
-; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 24
-; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 16
-; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 8
+; SI-NEXT: s_lshr_b32 s49, s7, 24
+; SI-NEXT: s_lshr_b32 s50, s7, 16
+; SI-NEXT: s_lshr_b32 s51, s7, 8
+; SI-NEXT: s_lshr_b32 s52, s9, 24
+; SI-NEXT: s_lshr_b32 s53, s9, 16
+; SI-NEXT: s_lshr_b32 s54, s9, 8
+; SI-NEXT: s_lshr_b32 s55, s11, 24
+; SI-NEXT: s_lshr_b32 s64, s11, 16
+; SI-NEXT: s_lshr_b32 s65, s11, 8
+; SI-NEXT: s_lshr_b32 s66, s13, 24
+; SI-NEXT: s_lshr_b32 s67, s13, 16
+; SI-NEXT: s_lshr_b32 s68, s13, 8
+; SI-NEXT: s_lshr_b32 s69, s15, 24
+; SI-NEXT: s_lshr_b32 s70, s15, 16
+; SI-NEXT: s_lshr_b32 s71, s15, 8
+; SI-NEXT: s_lshr_b32 s80, s17, 24
+; SI-NEXT: s_lshr_b32 s81, s17, 16
+; SI-NEXT: s_lshr_b32 s82, s17, 8
+; SI-NEXT: s_lshr_b32 s83, s19, 24
+; SI-NEXT: s_lshr_b32 s84, s19, 16
+; SI-NEXT: s_lshr_b32 s85, s19, 8
+; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24
+; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16
+; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8
+; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24
+; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16
+; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8
+; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 24
+; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16
+; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8
+; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24
+; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16
+; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8
+; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 24
+; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 16
+; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 8
+; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 24
+; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16
+; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 8
+; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24
+; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16
+; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8
+; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 24
+; SI-NEXT: s_lshr_b64 s[34:35], s[18:19], 16
+; SI-NEXT: s_lshr_b64 s[36:37], s[18:19], 8
; SI-NEXT: s_cbranch_execnz .LBB69_3
; SI-NEXT: .LBB69_2: ; %cmp.true
; SI-NEXT: s_add_u32 s4, s4, 3
; SI-NEXT: s_addc_u32 s5, s5, 0
-; SI-NEXT: s_add_u32 s28, s28, 3
-; SI-NEXT: s_addc_u32 s29, s29, 0
-; SI-NEXT: s_add_u32 s26, s26, 3
-; SI-NEXT: s_addc_u32 s27, s27, 0
-; SI-NEXT: s_add_u32 s24, s24, 3
-; SI-NEXT: s_addc_u32 s25, s25, 0
-; SI-NEXT: s_add_u32 s22, s22, 3
-; SI-NEXT: s_addc_u32 s23, s23, 0
-; SI-NEXT: s_add_u32 s20, s20, 3
-; SI-NEXT: s_addc_u32 s21, s21, 0
-; SI-NEXT: s_add_u32 s18, s18, 3
-; SI-NEXT: s_addc_u32 s19, s19, 0
+; SI-NEXT: s_add_u32 s6, s6, 3
+; SI-NEXT: s_addc_u32 s7, s7, 0
+; SI-NEXT: s_add_u32 s8, s8, 3
+; SI-NEXT: s_addc_u32 s9, s9, 0
+; SI-NEXT: s_add_u32 s10, s10, 3
+; SI-NEXT: s_addc_u32 s11, s11, 0
+; SI-NEXT: s_add_u32 s12, s12, 3
+; SI-NEXT: s_addc_u32 s13, s13, 0
+; SI-NEXT: s_add_u32 s14, s14, 3
+; SI-NEXT: s_addc_u32 s15, s15, 0
; SI-NEXT: s_add_u32 s16, s16, 3
; SI-NEXT: s_addc_u32 s17, s17, 0
+; SI-NEXT: s_add_u32 s18, s18, 3
+; SI-NEXT: s_addc_u32 s19, s19, 0
; SI-NEXT: s_lshr_b32 s38, s5, 24
; SI-NEXT: s_lshr_b32 s39, s5, 16
; SI-NEXT: s_lshr_b32 s48, s5, 8
-; SI-NEXT: s_lshr_b32 s49, s29, 24
-; SI-NEXT: s_lshr_b32 s50, s29, 16
-; SI-NEXT: s_lshr_b32 s51, s29, 8
-; SI-NEXT: s_lshr_b32 s52, s27, 24
-; SI-NEXT: s_lshr_b32 s53, s27, 16
-; SI-NEXT: s_lshr_b32 s54, s27, 8
-; SI-NEXT: s_lshr_b32 s55, s25, 24
-; SI-NEXT: s_lshr_b32 s64, s25, 16
-; SI-NEXT: s_lshr_b32 s65, s25, 8
-; SI-NEXT: s_lshr_b32 s66, s23, 24
-; SI-NEXT: s_lshr_b32 s67, s23, 16
-; SI-NEXT: s_lshr_b32 s68, s23, 8
-; SI-NEXT: s_lshr_b32 s69, s21, 24
-; SI-NEXT: s_lshr_b32 s70, s21, 16
-; SI-NEXT: s_lshr_b32 s71, s21, 8
-; SI-NEXT: s_lshr_b32 s80, s19, 24
-; SI-NEXT: s_lshr_b32 s81, s19, 16
-; SI-NEXT: s_lshr_b32 s82, s19, 8
-; SI-NEXT: s_lshr_b32 s83, s17, 24
-; SI-NEXT: s_lshr_b32 s84, s17, 16
-; SI-NEXT: s_lshr_b32 s85, s17, 8
-; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8
-; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 24
-; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16
-; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 8
-; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 24
-; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 16
-; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8
-; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 24
-; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 16
-; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 8
-; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 24
-; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16
-; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 8
-; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 24
-; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16
-; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 8
-; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24
-; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 16
-; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 8
-; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 24
-; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 16
-; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 8
+; SI-NEXT: s_lshr_b32 s49, s7, 24
+; SI-NEXT: s_lshr_b32 s50, s7, 16
+; SI-NEXT: s_lshr_b32 s51, s7, 8
+; SI-NEXT: s_lshr_b32 s52, s9, 24
+; SI-NEXT: s_lshr_b32 s53, s9, 16
+; SI-NEXT: s_lshr_b32 s54, s9, 8
+; SI-NEXT: s_lshr_b32 s55, s11, 24
+; SI-NEXT: s_lshr_b32 s64, s11, 16
+; SI-NEXT: s_lshr_b32 s65, s11, 8
+; SI-NEXT: s_lshr_b32 s66, s13, 24
+; SI-NEXT: s_lshr_b32 s67, s13, 16
+; SI-NEXT: s_lshr_b32 s68, s13, 8
+; SI-NEXT: s_lshr_b32 s69, s15, 24
+; SI-NEXT: s_lshr_b32 s70, s15, 16
+; SI-NEXT: s_lshr_b32 s71, s15, 8
+; SI-NEXT: s_lshr_b32 s80, s17, 24
+; SI-NEXT: s_lshr_b32 s81, s17, 16
+; SI-NEXT: s_lshr_b32 s82, s17, 8
+; SI-NEXT: s_lshr_b32 s83, s19, 24
+; SI-NEXT: s_lshr_b32 s84, s19, 16
+; SI-NEXT: s_lshr_b32 s85, s19, 8
+; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24
+; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16
+; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8
+; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24
+; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16
+; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8
+; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 24
+; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16
+; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8
+; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24
+; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16
+; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8
+; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 24
+; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 16
+; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 8
+; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 24
+; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16
+; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 8
+; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24
+; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16
+; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8
+; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 24
+; SI-NEXT: s_lshr_b64 s[34:35], s[18:19], 16
+; SI-NEXT: s_lshr_b64 s[36:37], s[18:19], 8
; SI-NEXT: .LBB69_3: ; %end
-; SI-NEXT: s_lshl_b32 s7, s36, 8
-; SI-NEXT: s_and_b32 s9, s16, 0xff
-; SI-NEXT: s_or_b32 s7, s9, s7
-; SI-NEXT: s_and_b32 s9, s34, 0xff
-; SI-NEXT: s_lshl_b32 s11, s30, 24
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_or_b32 s9, s11, s9
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: v_mov_b32_e32 v1, s7
-; SI-NEXT: s_and_b32 s7, s17, 0xff
-; SI-NEXT: s_lshl_b32 s9, s85, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s84, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s83, 24
-; SI-NEXT: s_or_b32 s9, s11, s9
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_lshl_b32 s7, s94, 8
-; SI-NEXT: s_and_b32 s9, s18, 0xff
-; SI-NEXT: s_or_b32 s7, s9, s7
-; SI-NEXT: s_and_b32 s9, s92, 0xff
-; SI-NEXT: s_lshl_b32 s11, s90, 24
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_or_b32 s9, s11, s9
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: v_mov_b32_e32 v3, s7
-; SI-NEXT: s_and_b32 s7, s19, 0xff
-; SI-NEXT: s_lshl_b32 s9, s82, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s81, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s80, 24
-; SI-NEXT: s_or_b32 s9, s11, s9
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: s_lshl_b32 s21, s36, 8
+; SI-NEXT: s_and_b32 s18, s18, 0xff
+; SI-NEXT: s_or_b32 s18, s18, s21
+; SI-NEXT: s_and_b32 s21, s34, 0xff
+; SI-NEXT: s_lshl_b32 s23, s30, 24
+; SI-NEXT: s_lshl_b32 s21, s21, 16
+; SI-NEXT: s_or_b32 s21, s23, s21
+; SI-NEXT: s_and_b32 s18, s18, 0xffff
+; SI-NEXT: s_or_b32 s18, s18, s21
+; SI-NEXT: v_mov_b32_e32 v1, s18
+; SI-NEXT: s_and_b32 s18, s19, 0xff
+; SI-NEXT: s_lshl_b32 s19, s85, 8
+; SI-NEXT: s_or_b32 s18, s18, s19
+; SI-NEXT: s_and_b32 s19, s84, 0xff
+; SI-NEXT: s_lshl_b32 s19, s19, 16
+; SI-NEXT: s_lshl_b32 s21, s83, 24
+; SI-NEXT: s_or_b32 s19, s21, s19
+; SI-NEXT: s_and_b32 s18, s18, 0xffff
+; SI-NEXT: s_or_b32 s18, s18, s19
+; SI-NEXT: v_mov_b32_e32 v2, s18
+; SI-NEXT: s_lshl_b32 s18, s94, 8
+; SI-NEXT: s_and_b32 s16, s16, 0xff
+; SI-NEXT: s_or_b32 s16, s16, s18
+; SI-NEXT: s_and_b32 s18, s92, 0xff
+; SI-NEXT: s_lshl_b32 s19, s90, 24
+; SI-NEXT: s_lshl_b32 s18, s18, 16
+; SI-NEXT: s_or_b32 s18, s19, s18
+; SI-NEXT: s_and_b32 s16, s16, 0xffff
+; SI-NEXT: s_or_b32 s16, s16, s18
+; SI-NEXT: v_mov_b32_e32 v3, s16
+; SI-NEXT: s_and_b32 s16, s17, 0xff
+; SI-NEXT: s_lshl_b32 s17, s82, 8
+; SI-NEXT: s_or_b32 s16, s16, s17
+; SI-NEXT: s_and_b32 s17, s81, 0xff
+; SI-NEXT: s_lshl_b32 s17, s17, 16
+; SI-NEXT: s_lshl_b32 s18, s80, 24
+; SI-NEXT: s_or_b32 s17, s18, s17
+; SI-NEXT: s_and_b32 s16, s16, 0xffff
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s20, 0xff
-; SI-NEXT: s_lshl_b32 s9, s76, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s72, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s62, 24
+; SI-NEXT: v_mov_b32_e32 v2, s16
+; SI-NEXT: s_and_b32 s14, s14, 0xff
+; SI-NEXT: s_lshl_b32 s16, s76, 8
+; SI-NEXT: s_or_b32 s14, s14, s16
+; SI-NEXT: s_and_b32 s16, s72, 0xff
+; SI-NEXT: s_lshl_b32 s16, s16, 16
+; SI-NEXT: s_lshl_b32 s17, s62, 24
; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: s_and_b32 s14, s14, 0xffff
+; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s14, s14, s16
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s21, 0xff
-; SI-NEXT: s_lshl_b32 s9, s71, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s70, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s69, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: v_mov_b32_e32 v2, s14
+; SI-NEXT: s_and_b32 s14, s15, 0xff
+; SI-NEXT: s_lshl_b32 s15, s71, 8
+; SI-NEXT: s_or_b32 s14, s14, s15
+; SI-NEXT: s_and_b32 s15, s70, 0xff
+; SI-NEXT: s_lshl_b32 s15, s15, 16
+; SI-NEXT: s_lshl_b32 s16, s69, 24
+; SI-NEXT: s_and_b32 s14, s14, 0xffff
+; SI-NEXT: s_or_b32 s15, s16, s15
; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s22, 0xff
-; SI-NEXT: s_lshl_b32 s9, s88, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s78, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s74, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: v_mov_b32_e32 v2, s14
+; SI-NEXT: s_and_b32 s12, s12, 0xff
+; SI-NEXT: s_lshl_b32 s14, s88, 8
+; SI-NEXT: s_or_b32 s12, s12, s14
+; SI-NEXT: s_and_b32 s14, s78, 0xff
+; SI-NEXT: s_lshl_b32 s14, s14, 16
+; SI-NEXT: s_lshl_b32 s15, s74, 24
+; SI-NEXT: s_and_b32 s12, s12, 0xffff
+; SI-NEXT: s_or_b32 s14, s15, s14
; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s12, s12, s14
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s23, 0xff
-; SI-NEXT: s_lshl_b32 s9, s68, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s67, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s66, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: v_mov_b32_e32 v2, s12
+; SI-NEXT: s_and_b32 s12, s13, 0xff
+; SI-NEXT: s_lshl_b32 s13, s68, 8
+; SI-NEXT: s_or_b32 s12, s12, s13
+; SI-NEXT: s_and_b32 s13, s67, 0xff
+; SI-NEXT: s_lshl_b32 s13, s13, 16
+; SI-NEXT: s_lshl_b32 s14, s66, 24
+; SI-NEXT: s_and_b32 s12, s12, 0xffff
+; SI-NEXT: s_or_b32 s13, s14, s13
; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s12, s12, s13
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s24, 0xff
-; SI-NEXT: s_lshl_b32 s9, s60, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s58, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s56, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: v_mov_b32_e32 v2, s12
+; SI-NEXT: s_and_b32 s10, s10, 0xff
+; SI-NEXT: s_lshl_b32 s12, s60, 8
+; SI-NEXT: s_or_b32 s10, s10, s12
+; SI-NEXT: s_and_b32 s12, s58, 0xff
+; SI-NEXT: s_lshl_b32 s12, s12, 16
+; SI-NEXT: s_lshl_b32 s13, s56, 24
+; SI-NEXT: s_and_b32 s10, s10, 0xffff
+; SI-NEXT: s_or_b32 s12, s13, s12
; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s10, s10, s12
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s25, 0xff
-; SI-NEXT: s_lshl_b32 s9, s65, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s64, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s55, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: s_and_b32 s10, s11, 0xff
+; SI-NEXT: s_lshl_b32 s11, s65, 8
+; SI-NEXT: s_or_b32 s10, s10, s11
+; SI-NEXT: s_and_b32 s11, s64, 0xff
+; SI-NEXT: s_lshl_b32 s11, s11, 16
+; SI-NEXT: s_lshl_b32 s12, s55, 24
+; SI-NEXT: s_and_b32 s10, s10, 0xffff
+; SI-NEXT: s_or_b32 s11, s12, s11
; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s10, s10, s11
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s26, 0xff
-; SI-NEXT: s_lshl_b32 s9, s46, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s44, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: s_lshl_b32 s10, s46, 8
+; SI-NEXT: s_or_b32 s8, s8, s10
+; SI-NEXT: s_and_b32 s10, s44, 0xff
+; SI-NEXT: s_lshl_b32 s10, s10, 16
; SI-NEXT: s_lshl_b32 s11, s42, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: s_and_b32 s8, s8, 0xffff
+; SI-NEXT: s_or_b32 s10, s11, s10
; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s8, s8, s10
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s27, 0xff
+; SI-NEXT: v_mov_b32_e32 v2, s8
+; SI-NEXT: s_and_b32 s8, s9, 0xff
; SI-NEXT: s_lshl_b32 s9, s54, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s8, s8, s9
; SI-NEXT: s_and_b32 s9, s53, 0xff
; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s52, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: s_lshl_b32 s10, s52, 24
+; SI-NEXT: s_and_b32 s8, s8, 0xffff
+; SI-NEXT: s_or_b32 s9, s10, s9
; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s8, s8, s9
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s28, 0xff
-; SI-NEXT: s_lshl_b32 s9, s40, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s14, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s12, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: v_mov_b32_e32 v2, s8
+; SI-NEXT: s_and_b32 s6, s6, 0xff
+; SI-NEXT: s_lshl_b32 s8, s40, 8
+; SI-NEXT: s_or_b32 s6, s6, s8
+; SI-NEXT: s_and_b32 s8, s28, 0xff
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s26, 24
+; SI-NEXT: s_and_b32 s6, s6, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s6, s6, s8
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
-; SI-NEXT: s_and_b32 s7, s29, 0xff
-; SI-NEXT: s_lshl_b32 s9, s51, 8
-; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: s_and_b32 s9, s50, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 16
-; SI-NEXT: s_lshl_b32 s11, s49, 24
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_or_b32 s9, s11, s9
+; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: s_and_b32 s6, s7, 0xff
+; SI-NEXT: s_lshl_b32 s7, s51, 8
+; SI-NEXT: s_or_b32 s6, s6, s7
+; SI-NEXT: s_and_b32 s7, s50, 0xff
+; SI-NEXT: s_lshl_b32 s7, s7, 16
+; SI-NEXT: s_lshl_b32 s8, s49, 24
+; SI-NEXT: s_and_b32 s6, s6, 0xffff
+; SI-NEXT: s_or_b32 s7, s8, s7
; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_or_b32 s6, s6, s7
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s7
+; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: s_and_b32 s4, s4, 0xff
-; SI-NEXT: s_lshl_b32 s7, s10, 8
-; SI-NEXT: s_or_b32 s4, s4, s7
-; SI-NEXT: s_and_b32 s7, s8, 0xff
-; SI-NEXT: s_lshl_b32 s7, s7, 16
-; SI-NEXT: s_lshl_b32 s6, s6, 24
+; SI-NEXT: s_lshl_b32 s6, s24, 8
+; SI-NEXT: s_or_b32 s4, s4, s6
+; SI-NEXT: s_and_b32 s6, s22, 0xff
+; SI-NEXT: s_lshl_b32 s6, s6, 16
+; SI-NEXT: s_lshl_b32 s7, s20, 24
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: s_or_b32 s6, s6, s7
+; SI-NEXT: s_or_b32 s6, s7, s6
; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0
; SI-NEXT: s_or_b32 s4, s4, s6
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
@@ -39350,38 +39797,38 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s85, v4, 29
-; SI-NEXT: v_readlane_b32 s84, v4, 28
-; SI-NEXT: v_readlane_b32 s83, v4, 27
-; SI-NEXT: v_readlane_b32 s82, v4, 26
-; SI-NEXT: v_readlane_b32 s81, v4, 25
-; SI-NEXT: v_readlane_b32 s80, v4, 24
-; SI-NEXT: v_readlane_b32 s71, v4, 23
-; SI-NEXT: v_readlane_b32 s70, v4, 22
-; SI-NEXT: v_readlane_b32 s69, v4, 21
-; SI-NEXT: v_readlane_b32 s68, v4, 20
-; SI-NEXT: v_readlane_b32 s67, v4, 19
-; SI-NEXT: v_readlane_b32 s66, v4, 18
-; SI-NEXT: v_readlane_b32 s65, v4, 17
-; SI-NEXT: v_readlane_b32 s64, v4, 16
-; SI-NEXT: v_readlane_b32 s55, v4, 15
-; SI-NEXT: v_readlane_b32 s54, v4, 14
-; SI-NEXT: v_readlane_b32 s53, v4, 13
-; SI-NEXT: v_readlane_b32 s52, v4, 12
-; SI-NEXT: v_readlane_b32 s51, v4, 11
-; SI-NEXT: v_readlane_b32 s50, v4, 10
-; SI-NEXT: v_readlane_b32 s49, v4, 9
-; SI-NEXT: v_readlane_b32 s48, v4, 8
-; SI-NEXT: v_readlane_b32 s39, v4, 7
-; SI-NEXT: v_readlane_b32 s38, v4, 6
-; SI-NEXT: v_readlane_b32 s37, v4, 5
-; SI-NEXT: v_readlane_b32 s36, v4, 4
-; SI-NEXT: v_readlane_b32 s35, v4, 3
-; SI-NEXT: v_readlane_b32 s34, v4, 2
-; SI-NEXT: v_readlane_b32 s31, v4, 1
-; SI-NEXT: v_readlane_b32 s30, v4, 0
+; SI-NEXT: v_readlane_b32 s85, v18, 29
+; SI-NEXT: v_readlane_b32 s84, v18, 28
+; SI-NEXT: v_readlane_b32 s83, v18, 27
+; SI-NEXT: v_readlane_b32 s82, v18, 26
+; SI-NEXT: v_readlane_b32 s81, v18, 25
+; SI-NEXT: v_readlane_b32 s80, v18, 24
+; SI-NEXT: v_readlane_b32 s71, v18, 23
+; SI-NEXT: v_readlane_b32 s70, v18, 22
+; SI-NEXT: v_readlane_b32 s69, v18, 21
+; SI-NEXT: v_readlane_b32 s68, v18, 20
+; SI-NEXT: v_readlane_b32 s67, v18, 19
+; SI-NEXT: v_readlane_b32 s66, v18, 18
+; SI-NEXT: v_readlane_b32 s65, v18, 17
+; SI-NEXT: v_readlane_b32 s64, v18, 16
+; SI-NEXT: v_readlane_b32 s55, v18, 15
+; SI-NEXT: v_readlane_b32 s54, v18, 14
+; SI-NEXT: v_readlane_b32 s53, v18, 13
+; SI-NEXT: v_readlane_b32 s52, v18, 12
+; SI-NEXT: v_readlane_b32 s51, v18, 11
+; SI-NEXT: v_readlane_b32 s50, v18, 10
+; SI-NEXT: v_readlane_b32 s49, v18, 9
+; SI-NEXT: v_readlane_b32 s48, v18, 8
+; SI-NEXT: v_readlane_b32 s39, v18, 7
+; SI-NEXT: v_readlane_b32 s38, v18, 6
+; SI-NEXT: v_readlane_b32 s37, v18, 5
+; SI-NEXT: v_readlane_b32 s36, v18, 4
+; SI-NEXT: v_readlane_b32 s35, v18, 3
+; SI-NEXT: v_readlane_b32 s34, v18, 2
+; SI-NEXT: v_readlane_b32 s31, v18, 1
+; SI-NEXT: v_readlane_b32 s30, v18, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -39429,43 +39876,71 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; SI-NEXT: ; implicit-def: $sgpr44
; SI-NEXT: ; implicit-def: $sgpr42
; SI-NEXT: ; implicit-def: $sgpr40
-; SI-NEXT: ; implicit-def: $sgpr14
-; SI-NEXT: ; implicit-def: $sgpr12
-; SI-NEXT: ; implicit-def: $sgpr10
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr28
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr24
+; SI-NEXT: ; implicit-def: $sgpr22
+; SI-NEXT: ; implicit-def: $sgpr20
; SI-NEXT: s_branch .LBB69_2
;
; VI-LABEL: bitcast_v8i64_to_v64i8_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v4, s30, 0
-; VI-NEXT: v_writelane_b32 v4, s31, 1
-; VI-NEXT: v_writelane_b32 v4, s34, 2
-; VI-NEXT: v_writelane_b32 v4, s35, 3
-; VI-NEXT: v_writelane_b32 v4, s36, 4
-; VI-NEXT: v_writelane_b32 v4, s37, 5
-; VI-NEXT: v_writelane_b32 v4, s38, 6
-; VI-NEXT: v_writelane_b32 v4, s39, 7
-; VI-NEXT: v_writelane_b32 v4, s48, 8
-; VI-NEXT: v_writelane_b32 v4, s49, 9
-; VI-NEXT: v_writelane_b32 v4, s50, 10
-; VI-NEXT: v_writelane_b32 v4, s51, 11
-; VI-NEXT: v_writelane_b32 v4, s52, 12
-; VI-NEXT: v_writelane_b32 v4, s53, 13
-; VI-NEXT: v_writelane_b32 v4, s54, 14
-; VI-NEXT: v_writelane_b32 v4, s55, 15
-; VI-NEXT: v_writelane_b32 v4, s64, 16
-; VI-NEXT: v_writelane_b32 v4, s65, 17
+; VI-NEXT: v_writelane_b32 v18, s30, 0
+; VI-NEXT: v_writelane_b32 v18, s31, 1
+; VI-NEXT: v_writelane_b32 v18, s34, 2
+; VI-NEXT: v_writelane_b32 v18, s35, 3
+; VI-NEXT: v_writelane_b32 v18, s36, 4
+; VI-NEXT: v_writelane_b32 v18, s37, 5
+; VI-NEXT: v_writelane_b32 v18, s38, 6
+; VI-NEXT: v_writelane_b32 v18, s39, 7
+; VI-NEXT: v_writelane_b32 v18, s48, 8
+; VI-NEXT: v_writelane_b32 v18, s49, 9
+; VI-NEXT: v_writelane_b32 v18, s50, 10
+; VI-NEXT: v_writelane_b32 v18, s51, 11
+; VI-NEXT: v_writelane_b32 v18, s52, 12
+; VI-NEXT: v_writelane_b32 v18, s53, 13
+; VI-NEXT: v_writelane_b32 v18, s54, 14
+; VI-NEXT: v_writelane_b32 v18, s55, 15
+; VI-NEXT: v_writelane_b32 v18, s64, 16
+; VI-NEXT: v_writelane_b32 v18, s65, 17
+; VI-NEXT: v_mov_b32_e32 v4, s16
+; VI-NEXT: v_mov_b32_e32 v5, s17
+; VI-NEXT: v_mov_b32_e32 v6, s18
+; VI-NEXT: v_mov_b32_e32 v7, s19
+; VI-NEXT: v_mov_b32_e32 v8, s20
+; VI-NEXT: v_mov_b32_e32 v9, s21
+; VI-NEXT: v_mov_b32_e32 v10, s22
+; VI-NEXT: v_mov_b32_e32 v11, s23
+; VI-NEXT: v_mov_b32_e32 v12, s24
+; VI-NEXT: v_mov_b32_e32 v13, s25
+; VI-NEXT: v_mov_b32_e32 v14, s26
+; VI-NEXT: v_mov_b32_e32 v15, s27
+; VI-NEXT: v_mov_b32_e32 v16, s28
+; VI-NEXT: v_mov_b32_e32 v17, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; VI-NEXT: v_writelane_b32 v4, s66, 18
+; VI-NEXT: v_writelane_b32 v18, s66, 18
+; VI-NEXT: v_readfirstlane_b32 s18, v4
+; VI-NEXT: v_readfirstlane_b32 s19, v5
+; VI-NEXT: v_readfirstlane_b32 s16, v6
+; VI-NEXT: v_readfirstlane_b32 s17, v7
+; VI-NEXT: v_readfirstlane_b32 s14, v8
+; VI-NEXT: v_readfirstlane_b32 s15, v9
+; VI-NEXT: v_readfirstlane_b32 s12, v10
+; VI-NEXT: v_readfirstlane_b32 s13, v11
+; VI-NEXT: v_readfirstlane_b32 s10, v12
+; VI-NEXT: v_readfirstlane_b32 s11, v13
+; VI-NEXT: v_readfirstlane_b32 s8, v14
+; VI-NEXT: v_readfirstlane_b32 s9, v15
+; VI-NEXT: v_readfirstlane_b32 s6, v16
+; VI-NEXT: v_readfirstlane_b32 s7, v17
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_and_b64 s[20:21], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s5, v2
-; VI-NEXT: v_writelane_b32 v4, s67, 19
+; VI-NEXT: v_writelane_b32 v18, s67, 19
; VI-NEXT: s_cbranch_scc0 .LBB69_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s56, s5, 24
@@ -39473,287 +39948,287 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; VI-NEXT: s_lshr_b32 s58, s5, 8
; VI-NEXT: s_lshr_b32 s59, s4, 16
; VI-NEXT: s_lshr_b32 s60, s4, 8
-; VI-NEXT: s_lshr_b32 s61, s29, 24
-; VI-NEXT: s_lshr_b32 s62, s29, 16
-; VI-NEXT: s_lshr_b32 s63, s29, 8
-; VI-NEXT: s_lshr_b32 s72, s28, 16
-; VI-NEXT: s_lshr_b32 s73, s28, 8
-; VI-NEXT: s_lshr_b32 s74, s27, 24
-; VI-NEXT: s_lshr_b32 s75, s27, 16
-; VI-NEXT: s_lshr_b32 s76, s27, 8
-; VI-NEXT: s_lshr_b32 s77, s26, 16
-; VI-NEXT: s_lshr_b32 s78, s26, 8
-; VI-NEXT: s_lshr_b32 s79, s25, 24
-; VI-NEXT: s_lshr_b32 s88, s25, 16
-; VI-NEXT: s_lshr_b32 s89, s25, 8
-; VI-NEXT: s_lshr_b32 s90, s24, 16
-; VI-NEXT: s_lshr_b32 s91, s24, 8
-; VI-NEXT: s_lshr_b32 s30, s23, 24
-; VI-NEXT: s_lshr_b32 s31, s23, 16
-; VI-NEXT: s_lshr_b32 s34, s23, 8
-; VI-NEXT: s_lshr_b32 s35, s22, 16
-; VI-NEXT: s_lshr_b32 s36, s22, 8
-; VI-NEXT: s_lshr_b32 s37, s21, 24
-; VI-NEXT: s_lshr_b32 s38, s21, 16
-; VI-NEXT: s_lshr_b32 s39, s21, 8
-; VI-NEXT: s_lshr_b32 s48, s20, 16
-; VI-NEXT: s_lshr_b32 s49, s20, 8
-; VI-NEXT: s_lshr_b32 s50, s19, 24
-; VI-NEXT: s_lshr_b32 s51, s19, 16
-; VI-NEXT: s_lshr_b32 s52, s19, 8
-; VI-NEXT: s_lshr_b32 s53, s18, 16
-; VI-NEXT: s_lshr_b32 s54, s18, 8
-; VI-NEXT: s_lshr_b32 s55, s17, 24
-; VI-NEXT: s_lshr_b32 s64, s17, 16
-; VI-NEXT: s_lshr_b32 s65, s17, 8
-; VI-NEXT: s_lshr_b32 s66, s16, 16
-; VI-NEXT: s_lshr_b32 s67, s16, 8
-; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24
-; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24
-; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
-; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24
-; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24
-; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24
-; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24
+; VI-NEXT: s_lshr_b32 s61, s7, 24
+; VI-NEXT: s_lshr_b32 s62, s7, 16
+; VI-NEXT: s_lshr_b32 s63, s7, 8
+; VI-NEXT: s_lshr_b32 s72, s6, 16
+; VI-NEXT: s_lshr_b32 s73, s6, 8
+; VI-NEXT: s_lshr_b32 s74, s9, 24
+; VI-NEXT: s_lshr_b32 s75, s9, 16
+; VI-NEXT: s_lshr_b32 s76, s9, 8
+; VI-NEXT: s_lshr_b32 s77, s8, 16
+; VI-NEXT: s_lshr_b32 s78, s8, 8
+; VI-NEXT: s_lshr_b32 s79, s11, 24
+; VI-NEXT: s_lshr_b32 s88, s11, 16
+; VI-NEXT: s_lshr_b32 s89, s11, 8
+; VI-NEXT: s_lshr_b32 s90, s10, 16
+; VI-NEXT: s_lshr_b32 s91, s10, 8
+; VI-NEXT: s_lshr_b32 s30, s13, 24
+; VI-NEXT: s_lshr_b32 s31, s13, 16
+; VI-NEXT: s_lshr_b32 s34, s13, 8
+; VI-NEXT: s_lshr_b32 s35, s12, 16
+; VI-NEXT: s_lshr_b32 s36, s12, 8
+; VI-NEXT: s_lshr_b32 s37, s15, 24
+; VI-NEXT: s_lshr_b32 s38, s15, 16
+; VI-NEXT: s_lshr_b32 s39, s15, 8
+; VI-NEXT: s_lshr_b32 s48, s14, 16
+; VI-NEXT: s_lshr_b32 s49, s14, 8
+; VI-NEXT: s_lshr_b32 s50, s17, 24
+; VI-NEXT: s_lshr_b32 s51, s17, 16
+; VI-NEXT: s_lshr_b32 s52, s17, 8
+; VI-NEXT: s_lshr_b32 s53, s16, 16
+; VI-NEXT: s_lshr_b32 s54, s16, 8
+; VI-NEXT: s_lshr_b32 s55, s19, 24
+; VI-NEXT: s_lshr_b32 s64, s19, 16
+; VI-NEXT: s_lshr_b32 s65, s19, 8
+; VI-NEXT: s_lshr_b32 s66, s18, 16
+; VI-NEXT: s_lshr_b32 s67, s18, 8
+; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24
+; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24
+; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24
+; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24
+; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24
+; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24
+; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24
; VI-NEXT: s_cbranch_execnz .LBB69_3
; VI-NEXT: .LBB69_2: ; %cmp.true
-; VI-NEXT: s_add_u32 s16, s16, 3
-; VI-NEXT: s_addc_u32 s17, s17, 0
; VI-NEXT: s_add_u32 s18, s18, 3
; VI-NEXT: s_addc_u32 s19, s19, 0
-; VI-NEXT: s_add_u32 s20, s20, 3
-; VI-NEXT: s_addc_u32 s21, s21, 0
-; VI-NEXT: s_add_u32 s22, s22, 3
-; VI-NEXT: s_addc_u32 s23, s23, 0
-; VI-NEXT: s_add_u32 s24, s24, 3
-; VI-NEXT: s_addc_u32 s25, s25, 0
-; VI-NEXT: s_add_u32 s26, s26, 3
-; VI-NEXT: s_addc_u32 s27, s27, 0
-; VI-NEXT: s_add_u32 s28, s28, 3
-; VI-NEXT: s_addc_u32 s29, s29, 0
+; VI-NEXT: s_add_u32 s16, s16, 3
+; VI-NEXT: s_addc_u32 s17, s17, 0
+; VI-NEXT: s_add_u32 s14, s14, 3
+; VI-NEXT: s_addc_u32 s15, s15, 0
+; VI-NEXT: s_add_u32 s12, s12, 3
+; VI-NEXT: s_addc_u32 s13, s13, 0
+; VI-NEXT: s_add_u32 s10, s10, 3
+; VI-NEXT: s_addc_u32 s11, s11, 0
+; VI-NEXT: s_add_u32 s8, s8, 3
+; VI-NEXT: s_addc_u32 s9, s9, 0
+; VI-NEXT: s_add_u32 s6, s6, 3
+; VI-NEXT: s_addc_u32 s7, s7, 0
; VI-NEXT: s_add_u32 s4, s4, 3
; VI-NEXT: s_addc_u32 s5, s5, 0
-; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24
-; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24
-; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
-; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24
-; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24
-; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24
-; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24
+; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24
+; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24
+; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24
+; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24
+; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24
+; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24
; VI-NEXT: s_lshr_b32 s56, s5, 24
; VI-NEXT: s_lshr_b32 s57, s5, 16
; VI-NEXT: s_lshr_b32 s58, s5, 8
; VI-NEXT: s_lshr_b32 s59, s4, 16
; VI-NEXT: s_lshr_b32 s60, s4, 8
-; VI-NEXT: s_lshr_b32 s61, s29, 24
-; VI-NEXT: s_lshr_b32 s62, s29, 16
-; VI-NEXT: s_lshr_b32 s63, s29, 8
-; VI-NEXT: s_lshr_b32 s72, s28, 16
-; VI-NEXT: s_lshr_b32 s73, s28, 8
-; VI-NEXT: s_lshr_b32 s74, s27, 24
-; VI-NEXT: s_lshr_b32 s75, s27, 16
-; VI-NEXT: s_lshr_b32 s76, s27, 8
-; VI-NEXT: s_lshr_b32 s77, s26, 16
-; VI-NEXT: s_lshr_b32 s78, s26, 8
-; VI-NEXT: s_lshr_b32 s79, s25, 24
-; VI-NEXT: s_lshr_b32 s88, s25, 16
-; VI-NEXT: s_lshr_b32 s89, s25, 8
-; VI-NEXT: s_lshr_b32 s90, s24, 16
-; VI-NEXT: s_lshr_b32 s91, s24, 8
-; VI-NEXT: s_lshr_b32 s30, s23, 24
-; VI-NEXT: s_lshr_b32 s31, s23, 16
-; VI-NEXT: s_lshr_b32 s34, s23, 8
-; VI-NEXT: s_lshr_b32 s35, s22, 16
-; VI-NEXT: s_lshr_b32 s36, s22, 8
-; VI-NEXT: s_lshr_b32 s37, s21, 24
-; VI-NEXT: s_lshr_b32 s38, s21, 16
-; VI-NEXT: s_lshr_b32 s39, s21, 8
-; VI-NEXT: s_lshr_b32 s48, s20, 16
-; VI-NEXT: s_lshr_b32 s49, s20, 8
-; VI-NEXT: s_lshr_b32 s50, s19, 24
-; VI-NEXT: s_lshr_b32 s51, s19, 16
-; VI-NEXT: s_lshr_b32 s52, s19, 8
-; VI-NEXT: s_lshr_b32 s53, s18, 16
-; VI-NEXT: s_lshr_b32 s54, s18, 8
-; VI-NEXT: s_lshr_b32 s55, s17, 24
-; VI-NEXT: s_lshr_b32 s64, s17, 16
-; VI-NEXT: s_lshr_b32 s65, s17, 8
-; VI-NEXT: s_lshr_b32 s66, s16, 16
-; VI-NEXT: s_lshr_b32 s67, s16, 8
+; VI-NEXT: s_lshr_b32 s61, s7, 24
+; VI-NEXT: s_lshr_b32 s62, s7, 16
+; VI-NEXT: s_lshr_b32 s63, s7, 8
+; VI-NEXT: s_lshr_b32 s72, s6, 16
+; VI-NEXT: s_lshr_b32 s73, s6, 8
+; VI-NEXT: s_lshr_b32 s74, s9, 24
+; VI-NEXT: s_lshr_b32 s75, s9, 16
+; VI-NEXT: s_lshr_b32 s76, s9, 8
+; VI-NEXT: s_lshr_b32 s77, s8, 16
+; VI-NEXT: s_lshr_b32 s78, s8, 8
+; VI-NEXT: s_lshr_b32 s79, s11, 24
+; VI-NEXT: s_lshr_b32 s88, s11, 16
+; VI-NEXT: s_lshr_b32 s89, s11, 8
+; VI-NEXT: s_lshr_b32 s90, s10, 16
+; VI-NEXT: s_lshr_b32 s91, s10, 8
+; VI-NEXT: s_lshr_b32 s30, s13, 24
+; VI-NEXT: s_lshr_b32 s31, s13, 16
+; VI-NEXT: s_lshr_b32 s34, s13, 8
+; VI-NEXT: s_lshr_b32 s35, s12, 16
+; VI-NEXT: s_lshr_b32 s36, s12, 8
+; VI-NEXT: s_lshr_b32 s37, s15, 24
+; VI-NEXT: s_lshr_b32 s38, s15, 16
+; VI-NEXT: s_lshr_b32 s39, s15, 8
+; VI-NEXT: s_lshr_b32 s48, s14, 16
+; VI-NEXT: s_lshr_b32 s49, s14, 8
+; VI-NEXT: s_lshr_b32 s50, s17, 24
+; VI-NEXT: s_lshr_b32 s51, s17, 16
+; VI-NEXT: s_lshr_b32 s52, s17, 8
+; VI-NEXT: s_lshr_b32 s53, s16, 16
+; VI-NEXT: s_lshr_b32 s54, s16, 8
+; VI-NEXT: s_lshr_b32 s55, s19, 24
+; VI-NEXT: s_lshr_b32 s64, s19, 16
+; VI-NEXT: s_lshr_b32 s65, s19, 8
+; VI-NEXT: s_lshr_b32 s66, s18, 16
+; VI-NEXT: s_lshr_b32 s67, s18, 8
; VI-NEXT: .LBB69_3: ; %end
-; VI-NEXT: s_and_b32 s7, s16, 0xff
-; VI-NEXT: s_lshl_b32 s9, s67, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s66, 0xff
-; VI-NEXT: s_lshl_b32 s11, s44, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: s_and_b32 s7, s17, 0xff
-; VI-NEXT: s_lshl_b32 s9, s65, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s64, 0xff
-; VI-NEXT: s_lshl_b32 s11, s55, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s18, 0xff
-; VI-NEXT: s_lshl_b32 s9, s54, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s53, 0xff
-; VI-NEXT: s_lshl_b32 s11, s42, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: s_and_b32 s18, s18, 0xff
+; VI-NEXT: s_lshl_b32 s21, s67, 8
+; VI-NEXT: s_or_b32 s18, s18, s21
+; VI-NEXT: s_and_b32 s21, s66, 0xff
+; VI-NEXT: s_lshl_b32 s23, s44, 8
+; VI-NEXT: s_or_b32 s21, s21, s23
+; VI-NEXT: s_and_b32 s18, s18, 0xffff
+; VI-NEXT: s_lshl_b32 s21, s21, 16
+; VI-NEXT: s_or_b32 s18, s18, s21
+; VI-NEXT: v_mov_b32_e32 v1, s18
+; VI-NEXT: s_and_b32 s18, s19, 0xff
+; VI-NEXT: s_lshl_b32 s19, s65, 8
+; VI-NEXT: s_or_b32 s18, s18, s19
+; VI-NEXT: s_and_b32 s19, s64, 0xff
+; VI-NEXT: s_lshl_b32 s21, s55, 8
+; VI-NEXT: s_or_b32 s19, s19, s21
+; VI-NEXT: s_and_b32 s18, s18, 0xffff
+; VI-NEXT: s_lshl_b32 s19, s19, 16
+; VI-NEXT: s_or_b32 s18, s18, s19
+; VI-NEXT: v_mov_b32_e32 v2, s18
+; VI-NEXT: s_and_b32 s16, s16, 0xff
+; VI-NEXT: s_lshl_b32 s18, s54, 8
+; VI-NEXT: s_or_b32 s16, s16, s18
+; VI-NEXT: s_and_b32 s18, s53, 0xff
+; VI-NEXT: s_lshl_b32 s19, s42, 8
+; VI-NEXT: s_or_b32 s18, s18, s19
+; VI-NEXT: s_and_b32 s16, s16, 0xffff
+; VI-NEXT: s_lshl_b32 s18, s18, 16
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s16, s16, s18
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s19, 0xff
-; VI-NEXT: s_lshl_b32 s9, s52, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s51, 0xff
-; VI-NEXT: s_lshl_b32 s11, s50, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s16
+; VI-NEXT: s_and_b32 s16, s17, 0xff
+; VI-NEXT: s_lshl_b32 s17, s52, 8
+; VI-NEXT: s_or_b32 s16, s16, s17
+; VI-NEXT: s_and_b32 s17, s51, 0xff
+; VI-NEXT: s_lshl_b32 s18, s50, 8
+; VI-NEXT: s_or_b32 s17, s17, s18
+; VI-NEXT: s_and_b32 s16, s16, 0xffff
+; VI-NEXT: s_lshl_b32 s17, s17, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s16, s16, s17
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s20, 0xff
-; VI-NEXT: s_lshl_b32 s9, s49, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s48, 0xff
-; VI-NEXT: s_lshl_b32 s11, s40, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s16
+; VI-NEXT: s_and_b32 s14, s14, 0xff
+; VI-NEXT: s_lshl_b32 s16, s49, 8
+; VI-NEXT: s_or_b32 s14, s14, s16
+; VI-NEXT: s_and_b32 s16, s48, 0xff
+; VI-NEXT: s_lshl_b32 s17, s40, 8
+; VI-NEXT: s_or_b32 s16, s16, s17
+; VI-NEXT: s_and_b32 s14, s14, 0xffff
+; VI-NEXT: s_lshl_b32 s16, s16, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s14, s14, s16
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s21, 0xff
-; VI-NEXT: s_lshl_b32 s9, s39, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s38, 0xff
-; VI-NEXT: s_lshl_b32 s11, s37, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s14
+; VI-NEXT: s_and_b32 s14, s15, 0xff
+; VI-NEXT: s_lshl_b32 s15, s39, 8
+; VI-NEXT: s_or_b32 s14, s14, s15
+; VI-NEXT: s_and_b32 s15, s38, 0xff
+; VI-NEXT: s_lshl_b32 s16, s37, 8
+; VI-NEXT: s_or_b32 s15, s15, s16
+; VI-NEXT: s_and_b32 s14, s14, 0xffff
+; VI-NEXT: s_lshl_b32 s15, s15, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s14, s14, s15
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s22, 0xff
-; VI-NEXT: s_lshl_b32 s9, s36, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s35, 0xff
-; VI-NEXT: s_lshl_b32 s11, s14, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s14
+; VI-NEXT: s_and_b32 s12, s12, 0xff
+; VI-NEXT: s_lshl_b32 s14, s36, 8
+; VI-NEXT: s_or_b32 s12, s12, s14
+; VI-NEXT: s_and_b32 s14, s35, 0xff
+; VI-NEXT: s_lshl_b32 s15, s28, 8
+; VI-NEXT: s_or_b32 s14, s14, s15
+; VI-NEXT: s_and_b32 s12, s12, 0xffff
+; VI-NEXT: s_lshl_b32 s14, s14, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s12, s12, s14
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s23, 0xff
-; VI-NEXT: s_lshl_b32 s9, s34, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s31, 0xff
-; VI-NEXT: s_lshl_b32 s11, s30, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s12
+; VI-NEXT: s_and_b32 s12, s13, 0xff
+; VI-NEXT: s_lshl_b32 s13, s34, 8
+; VI-NEXT: s_or_b32 s12, s12, s13
+; VI-NEXT: s_and_b32 s13, s31, 0xff
+; VI-NEXT: s_lshl_b32 s14, s30, 8
+; VI-NEXT: s_or_b32 s13, s13, s14
+; VI-NEXT: s_and_b32 s12, s12, 0xffff
+; VI-NEXT: s_lshl_b32 s13, s13, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s12, s12, s13
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s24, 0xff
-; VI-NEXT: s_lshl_b32 s9, s91, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s90, 0xff
-; VI-NEXT: s_lshl_b32 s11, s12, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s12
+; VI-NEXT: s_and_b32 s10, s10, 0xff
+; VI-NEXT: s_lshl_b32 s12, s91, 8
+; VI-NEXT: s_or_b32 s10, s10, s12
+; VI-NEXT: s_and_b32 s12, s90, 0xff
+; VI-NEXT: s_lshl_b32 s13, s26, 8
+; VI-NEXT: s_or_b32 s12, s12, s13
+; VI-NEXT: s_and_b32 s10, s10, 0xffff
+; VI-NEXT: s_lshl_b32 s12, s12, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s10, s10, s12
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s25, 0xff
-; VI-NEXT: s_lshl_b32 s9, s89, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s88, 0xff
-; VI-NEXT: s_lshl_b32 s11, s79, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: s_and_b32 s10, s11, 0xff
+; VI-NEXT: s_lshl_b32 s11, s89, 8
+; VI-NEXT: s_or_b32 s10, s10, s11
+; VI-NEXT: s_and_b32 s11, s88, 0xff
+; VI-NEXT: s_lshl_b32 s12, s79, 8
+; VI-NEXT: s_or_b32 s11, s11, s12
+; VI-NEXT: s_and_b32 s10, s10, 0xffff
+; VI-NEXT: s_lshl_b32 s11, s11, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s10, s10, s11
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s26, 0xff
-; VI-NEXT: s_lshl_b32 s9, s78, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s77, 0xff
-; VI-NEXT: s_lshl_b32 s10, s10, 8
-; VI-NEXT: s_or_b32 s9, s9, s10
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: s_and_b32 s8, s8, 0xff
+; VI-NEXT: s_lshl_b32 s10, s78, 8
+; VI-NEXT: s_or_b32 s8, s8, s10
+; VI-NEXT: s_and_b32 s10, s77, 0xff
+; VI-NEXT: s_lshl_b32 s11, s24, 8
+; VI-NEXT: s_or_b32 s10, s10, s11
+; VI-NEXT: s_and_b32 s8, s8, 0xffff
+; VI-NEXT: s_lshl_b32 s10, s10, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s8, s8, s10
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s27, 0xff
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: s_and_b32 s8, s9, 0xff
; VI-NEXT: s_lshl_b32 s9, s76, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s8, s8, s9
; VI-NEXT: s_and_b32 s9, s75, 0xff
; VI-NEXT: s_lshl_b32 s10, s74, 8
; VI-NEXT: s_or_b32 s9, s9, s10
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_and_b32 s8, s8, 0xffff
; VI-NEXT: s_lshl_b32 s9, s9, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s8, s8, s9
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s28, 0xff
-; VI-NEXT: s_lshl_b32 s9, s73, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s72, 0xff
-; VI-NEXT: s_lshl_b32 s8, s8, 8
-; VI-NEXT: s_or_b32 s8, s9, s8
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xff
+; VI-NEXT: s_lshl_b32 s8, s73, 8
+; VI-NEXT: s_or_b32 s6, s6, s8
+; VI-NEXT: s_and_b32 s8, s72, 0xff
+; VI-NEXT: s_lshl_b32 s9, s22, 8
+; VI-NEXT: s_or_b32 s8, s8, s9
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
; VI-NEXT: s_lshl_b32 s8, s8, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0
-; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_or_b32 s6, s6, s8
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s29, 0xff
-; VI-NEXT: s_lshl_b32 s8, s63, 8
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: s_and_b32 s6, s7, 0xff
+; VI-NEXT: s_lshl_b32 s7, s63, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s62, 0xff
+; VI-NEXT: s_lshl_b32 s8, s61, 8
; VI-NEXT: s_or_b32 s7, s7, s8
-; VI-NEXT: s_and_b32 s8, s62, 0xff
-; VI-NEXT: s_lshl_b32 s9, s61, 8
-; VI-NEXT: s_or_b32 s8, s8, s9
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s8, s8, 16
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0
-; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_or_b32 s6, s6, s7
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: s_and_b32 s4, s4, 0xff
-; VI-NEXT: s_lshl_b32 s7, s60, 8
-; VI-NEXT: s_or_b32 s4, s4, s7
-; VI-NEXT: s_and_b32 s7, s59, 0xff
-; VI-NEXT: s_lshl_b32 s6, s6, 8
-; VI-NEXT: s_or_b32 s6, s7, s6
+; VI-NEXT: s_lshl_b32 s6, s60, 8
+; VI-NEXT: s_or_b32 s4, s4, s6
+; VI-NEXT: s_and_b32 s6, s59, 0xff
+; VI-NEXT: s_lshl_b32 s7, s20, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
; VI-NEXT: s_and_b32 s4, s4, 0xffff
; VI-NEXT: s_lshl_b32 s6, s6, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0
@@ -39774,28 +40249,28 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_readlane_b32 s67, v4, 19
-; VI-NEXT: v_readlane_b32 s66, v4, 18
-; VI-NEXT: v_readlane_b32 s65, v4, 17
-; VI-NEXT: v_readlane_b32 s64, v4, 16
-; VI-NEXT: v_readlane_b32 s55, v4, 15
-; VI-NEXT: v_readlane_b32 s54, v4, 14
-; VI-NEXT: v_readlane_b32 s53, v4, 13
-; VI-NEXT: v_readlane_b32 s52, v4, 12
-; VI-NEXT: v_readlane_b32 s51, v4, 11
-; VI-NEXT: v_readlane_b32 s50, v4, 10
-; VI-NEXT: v_readlane_b32 s49, v4, 9
-; VI-NEXT: v_readlane_b32 s48, v4, 8
-; VI-NEXT: v_readlane_b32 s39, v4, 7
-; VI-NEXT: v_readlane_b32 s38, v4, 6
-; VI-NEXT: v_readlane_b32 s37, v4, 5
-; VI-NEXT: v_readlane_b32 s36, v4, 4
-; VI-NEXT: v_readlane_b32 s35, v4, 3
-; VI-NEXT: v_readlane_b32 s34, v4, 2
-; VI-NEXT: v_readlane_b32 s31, v4, 1
-; VI-NEXT: v_readlane_b32 s30, v4, 0
+; VI-NEXT: v_readlane_b32 s67, v18, 19
+; VI-NEXT: v_readlane_b32 s66, v18, 18
+; VI-NEXT: v_readlane_b32 s65, v18, 17
+; VI-NEXT: v_readlane_b32 s64, v18, 16
+; VI-NEXT: v_readlane_b32 s55, v18, 15
+; VI-NEXT: v_readlane_b32 s54, v18, 14
+; VI-NEXT: v_readlane_b32 s53, v18, 13
+; VI-NEXT: v_readlane_b32 s52, v18, 12
+; VI-NEXT: v_readlane_b32 s51, v18, 11
+; VI-NEXT: v_readlane_b32 s50, v18, 10
+; VI-NEXT: v_readlane_b32 s49, v18, 9
+; VI-NEXT: v_readlane_b32 s48, v18, 8
+; VI-NEXT: v_readlane_b32 s39, v18, 7
+; VI-NEXT: v_readlane_b32 s38, v18, 6
+; VI-NEXT: v_readlane_b32 s37, v18, 5
+; VI-NEXT: v_readlane_b32 s36, v18, 4
+; VI-NEXT: v_readlane_b32 s35, v18, 3
+; VI-NEXT: v_readlane_b32 s34, v18, 2
+; VI-NEXT: v_readlane_b32 s31, v18, 1
+; VI-NEXT: v_readlane_b32 s30, v18, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -39820,31 +40295,31 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; VI-NEXT: ; implicit-def: $sgpr37
; VI-NEXT: ; implicit-def: $sgpr36
; VI-NEXT: ; implicit-def: $sgpr35
-; VI-NEXT: ; implicit-def: $sgpr14
+; VI-NEXT: ; implicit-def: $sgpr28
; VI-NEXT: ; implicit-def: $sgpr34
; VI-NEXT: ; implicit-def: $sgpr31
; VI-NEXT: ; implicit-def: $sgpr30
; VI-NEXT: ; implicit-def: $sgpr91
; VI-NEXT: ; implicit-def: $sgpr90
-; VI-NEXT: ; implicit-def: $sgpr12
+; VI-NEXT: ; implicit-def: $sgpr26
; VI-NEXT: ; implicit-def: $sgpr89
; VI-NEXT: ; implicit-def: $sgpr88
; VI-NEXT: ; implicit-def: $sgpr79
; VI-NEXT: ; implicit-def: $sgpr78
; VI-NEXT: ; implicit-def: $sgpr77
-; VI-NEXT: ; implicit-def: $sgpr10
+; VI-NEXT: ; implicit-def: $sgpr24
; VI-NEXT: ; implicit-def: $sgpr76
; VI-NEXT: ; implicit-def: $sgpr75
; VI-NEXT: ; implicit-def: $sgpr74
; VI-NEXT: ; implicit-def: $sgpr73
; VI-NEXT: ; implicit-def: $sgpr72
-; VI-NEXT: ; implicit-def: $sgpr8
+; VI-NEXT: ; implicit-def: $sgpr22
; VI-NEXT: ; implicit-def: $sgpr63
; VI-NEXT: ; implicit-def: $sgpr62
; VI-NEXT: ; implicit-def: $sgpr61
; VI-NEXT: ; implicit-def: $sgpr60
; VI-NEXT: ; implicit-def: $sgpr59
-; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr20
; VI-NEXT: ; implicit-def: $sgpr58
; VI-NEXT: ; implicit-def: $sgpr57
; VI-NEXT: ; implicit-def: $sgpr56
@@ -39854,28 +40329,56 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v4, s30, 0
-; GFX9-NEXT: v_writelane_b32 v4, s31, 1
-; GFX9-NEXT: v_writelane_b32 v4, s34, 2
-; GFX9-NEXT: v_writelane_b32 v4, s35, 3
-; GFX9-NEXT: v_writelane_b32 v4, s36, 4
-; GFX9-NEXT: v_writelane_b32 v4, s37, 5
-; GFX9-NEXT: v_writelane_b32 v4, s38, 6
-; GFX9-NEXT: v_writelane_b32 v4, s39, 7
-; GFX9-NEXT: v_writelane_b32 v4, s48, 8
-; GFX9-NEXT: v_writelane_b32 v4, s49, 9
-; GFX9-NEXT: v_writelane_b32 v4, s50, 10
-; GFX9-NEXT: v_writelane_b32 v4, s51, 11
-; GFX9-NEXT: v_writelane_b32 v4, s52, 12
-; GFX9-NEXT: v_writelane_b32 v4, s53, 13
+; GFX9-NEXT: v_writelane_b32 v18, s30, 0
+; GFX9-NEXT: v_writelane_b32 v18, s31, 1
+; GFX9-NEXT: v_writelane_b32 v18, s34, 2
+; GFX9-NEXT: v_writelane_b32 v18, s35, 3
+; GFX9-NEXT: v_writelane_b32 v18, s36, 4
+; GFX9-NEXT: v_writelane_b32 v18, s37, 5
+; GFX9-NEXT: v_writelane_b32 v18, s38, 6
+; GFX9-NEXT: v_writelane_b32 v18, s39, 7
+; GFX9-NEXT: v_writelane_b32 v18, s48, 8
+; GFX9-NEXT: v_writelane_b32 v18, s49, 9
+; GFX9-NEXT: v_writelane_b32 v18, s50, 10
+; GFX9-NEXT: v_writelane_b32 v18, s51, 11
+; GFX9-NEXT: v_writelane_b32 v18, s52, 12
+; GFX9-NEXT: v_writelane_b32 v18, s53, 13
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: v_mov_b32_e32 v5, s17
+; GFX9-NEXT: v_mov_b32_e32 v6, s18
+; GFX9-NEXT: v_mov_b32_e32 v7, s19
+; GFX9-NEXT: v_mov_b32_e32 v8, s20
+; GFX9-NEXT: v_mov_b32_e32 v9, s21
+; GFX9-NEXT: v_mov_b32_e32 v10, s22
+; GFX9-NEXT: v_mov_b32_e32 v11, s23
+; GFX9-NEXT: v_mov_b32_e32 v12, s24
+; GFX9-NEXT: v_mov_b32_e32 v13, s25
+; GFX9-NEXT: v_mov_b32_e32 v14, s26
+; GFX9-NEXT: v_mov_b32_e32 v15, s27
+; GFX9-NEXT: v_mov_b32_e32 v16, s28
+; GFX9-NEXT: v_mov_b32_e32 v17, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT: v_writelane_b32 v4, s54, 14
+; GFX9-NEXT: v_writelane_b32 v18, s54, 14
+; GFX9-NEXT: v_readfirstlane_b32 s18, v4
+; GFX9-NEXT: v_readfirstlane_b32 s19, v5
+; GFX9-NEXT: v_readfirstlane_b32 s16, v6
+; GFX9-NEXT: v_readfirstlane_b32 s17, v7
+; GFX9-NEXT: v_readfirstlane_b32 s14, v8
+; GFX9-NEXT: v_readfirstlane_b32 s15, v9
+; GFX9-NEXT: v_readfirstlane_b32 s12, v10
+; GFX9-NEXT: v_readfirstlane_b32 s13, v11
+; GFX9-NEXT: v_readfirstlane_b32 s10, v12
+; GFX9-NEXT: v_readfirstlane_b32 s11, v13
+; GFX9-NEXT: v_readfirstlane_b32 s8, v14
+; GFX9-NEXT: v_readfirstlane_b32 s9, v15
+; GFX9-NEXT: v_readfirstlane_b32 s6, v16
+; GFX9-NEXT: v_readfirstlane_b32 s7, v17
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
-; GFX9-NEXT: v_writelane_b32 v4, s55, 15
+; GFX9-NEXT: v_writelane_b32 v18, s55, 15
; GFX9-NEXT: s_cbranch_scc0 .LBB69_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s56, s5, 24
@@ -39883,275 +40386,275 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; GFX9-NEXT: s_lshr_b32 s58, s5, 8
; GFX9-NEXT: s_lshr_b32 s59, s4, 16
; GFX9-NEXT: s_lshr_b32 s60, s4, 8
-; GFX9-NEXT: s_lshr_b32 s61, s29, 24
-; GFX9-NEXT: s_lshr_b32 s62, s29, 16
-; GFX9-NEXT: s_lshr_b32 s63, s29, 8
-; GFX9-NEXT: s_lshr_b32 s72, s28, 16
-; GFX9-NEXT: s_lshr_b32 s73, s28, 8
-; GFX9-NEXT: s_lshr_b32 s74, s27, 24
-; GFX9-NEXT: s_lshr_b32 s75, s27, 16
-; GFX9-NEXT: s_lshr_b32 s76, s27, 8
-; GFX9-NEXT: s_lshr_b32 s77, s26, 16
-; GFX9-NEXT: s_lshr_b32 s78, s26, 8
-; GFX9-NEXT: s_lshr_b32 s79, s25, 24
-; GFX9-NEXT: s_lshr_b32 s88, s25, 16
-; GFX9-NEXT: s_lshr_b32 s89, s25, 8
-; GFX9-NEXT: s_lshr_b32 s90, s24, 16
-; GFX9-NEXT: s_lshr_b32 s91, s24, 8
-; GFX9-NEXT: s_lshr_b32 s92, s23, 24
-; GFX9-NEXT: s_lshr_b32 s93, s23, 16
-; GFX9-NEXT: s_lshr_b32 s94, s23, 8
-; GFX9-NEXT: s_lshr_b32 s95, s22, 16
-; GFX9-NEXT: s_lshr_b32 s30, s22, 8
-; GFX9-NEXT: s_lshr_b32 s31, s21, 24
-; GFX9-NEXT: s_lshr_b32 s34, s21, 16
-; GFX9-NEXT: s_lshr_b32 s35, s21, 8
-; GFX9-NEXT: s_lshr_b32 s36, s20, 16
-; GFX9-NEXT: s_lshr_b32 s37, s20, 8
-; GFX9-NEXT: s_lshr_b32 s38, s19, 24
-; GFX9-NEXT: s_lshr_b32 s39, s19, 16
-; GFX9-NEXT: s_lshr_b32 s48, s19, 8
-; GFX9-NEXT: s_lshr_b32 s49, s18, 16
-; GFX9-NEXT: s_lshr_b32 s50, s18, 8
-; GFX9-NEXT: s_lshr_b32 s51, s17, 24
-; GFX9-NEXT: s_lshr_b32 s52, s17, 16
-; GFX9-NEXT: s_lshr_b32 s53, s17, 8
-; GFX9-NEXT: s_lshr_b32 s54, s16, 16
-; GFX9-NEXT: s_lshr_b32 s55, s16, 8
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24
-; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
-; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24
-; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24
-; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24
-; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24
+; GFX9-NEXT: s_lshr_b32 s61, s7, 24
+; GFX9-NEXT: s_lshr_b32 s62, s7, 16
+; GFX9-NEXT: s_lshr_b32 s63, s7, 8
+; GFX9-NEXT: s_lshr_b32 s72, s6, 16
+; GFX9-NEXT: s_lshr_b32 s73, s6, 8
+; GFX9-NEXT: s_lshr_b32 s74, s9, 24
+; GFX9-NEXT: s_lshr_b32 s75, s9, 16
+; GFX9-NEXT: s_lshr_b32 s76, s9, 8
+; GFX9-NEXT: s_lshr_b32 s77, s8, 16
+; GFX9-NEXT: s_lshr_b32 s78, s8, 8
+; GFX9-NEXT: s_lshr_b32 s79, s11, 24
+; GFX9-NEXT: s_lshr_b32 s88, s11, 16
+; GFX9-NEXT: s_lshr_b32 s89, s11, 8
+; GFX9-NEXT: s_lshr_b32 s90, s10, 16
+; GFX9-NEXT: s_lshr_b32 s91, s10, 8
+; GFX9-NEXT: s_lshr_b32 s92, s13, 24
+; GFX9-NEXT: s_lshr_b32 s93, s13, 16
+; GFX9-NEXT: s_lshr_b32 s94, s13, 8
+; GFX9-NEXT: s_lshr_b32 s95, s12, 16
+; GFX9-NEXT: s_lshr_b32 s30, s12, 8
+; GFX9-NEXT: s_lshr_b32 s31, s15, 24
+; GFX9-NEXT: s_lshr_b32 s34, s15, 16
+; GFX9-NEXT: s_lshr_b32 s35, s15, 8
+; GFX9-NEXT: s_lshr_b32 s36, s14, 16
+; GFX9-NEXT: s_lshr_b32 s37, s14, 8
+; GFX9-NEXT: s_lshr_b32 s38, s17, 24
+; GFX9-NEXT: s_lshr_b32 s39, s17, 16
+; GFX9-NEXT: s_lshr_b32 s48, s17, 8
+; GFX9-NEXT: s_lshr_b32 s49, s16, 16
+; GFX9-NEXT: s_lshr_b32 s50, s16, 8
+; GFX9-NEXT: s_lshr_b32 s51, s19, 24
+; GFX9-NEXT: s_lshr_b32 s52, s19, 16
+; GFX9-NEXT: s_lshr_b32 s53, s19, 8
+; GFX9-NEXT: s_lshr_b32 s54, s18, 16
+; GFX9-NEXT: s_lshr_b32 s55, s18, 8
+; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], 24
+; GFX9-NEXT: s_lshr_b64 s[22:23], s[6:7], 24
+; GFX9-NEXT: s_lshr_b64 s[24:25], s[8:9], 24
+; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24
+; GFX9-NEXT: s_lshr_b64 s[28:29], s[12:13], 24
+; GFX9-NEXT: s_lshr_b64 s[40:41], s[14:15], 24
+; GFX9-NEXT: s_lshr_b64 s[42:43], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[44:45], s[18:19], 24
; GFX9-NEXT: s_cbranch_execnz .LBB69_3
; GFX9-NEXT: .LBB69_2: ; %cmp.true
-; GFX9-NEXT: s_add_u32 s16, s16, 3
-; GFX9-NEXT: s_addc_u32 s17, s17, 0
; GFX9-NEXT: s_add_u32 s18, s18, 3
; GFX9-NEXT: s_addc_u32 s19, s19, 0
-; GFX9-NEXT: s_add_u32 s20, s20, 3
-; GFX9-NEXT: s_addc_u32 s21, s21, 0
-; GFX9-NEXT: s_add_u32 s22, s22, 3
-; GFX9-NEXT: s_addc_u32 s23, s23, 0
-; GFX9-NEXT: s_add_u32 s24, s24, 3
-; GFX9-NEXT: s_addc_u32 s25, s25, 0
-; GFX9-NEXT: s_add_u32 s26, s26, 3
-; GFX9-NEXT: s_addc_u32 s27, s27, 0
-; GFX9-NEXT: s_add_u32 s28, s28, 3
-; GFX9-NEXT: s_addc_u32 s29, s29, 0
+; GFX9-NEXT: s_add_u32 s16, s16, 3
+; GFX9-NEXT: s_addc_u32 s17, s17, 0
+; GFX9-NEXT: s_add_u32 s14, s14, 3
+; GFX9-NEXT: s_addc_u32 s15, s15, 0
+; GFX9-NEXT: s_add_u32 s12, s12, 3
+; GFX9-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-NEXT: s_add_u32 s10, s10, 3
+; GFX9-NEXT: s_addc_u32 s11, s11, 0
+; GFX9-NEXT: s_add_u32 s8, s8, 3
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: s_add_u32 s6, s6, 3
+; GFX9-NEXT: s_addc_u32 s7, s7, 0
; GFX9-NEXT: s_add_u32 s4, s4, 3
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24
-; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
-; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24
-; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24
-; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24
-; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], 24
+; GFX9-NEXT: s_lshr_b64 s[22:23], s[6:7], 24
+; GFX9-NEXT: s_lshr_b64 s[24:25], s[8:9], 24
+; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24
+; GFX9-NEXT: s_lshr_b64 s[28:29], s[12:13], 24
+; GFX9-NEXT: s_lshr_b64 s[40:41], s[14:15], 24
+; GFX9-NEXT: s_lshr_b64 s[42:43], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[44:45], s[18:19], 24
; GFX9-NEXT: s_lshr_b32 s56, s5, 24
; GFX9-NEXT: s_lshr_b32 s57, s5, 16
; GFX9-NEXT: s_lshr_b32 s58, s5, 8
; GFX9-NEXT: s_lshr_b32 s59, s4, 16
; GFX9-NEXT: s_lshr_b32 s60, s4, 8
-; GFX9-NEXT: s_lshr_b32 s61, s29, 24
-; GFX9-NEXT: s_lshr_b32 s62, s29, 16
-; GFX9-NEXT: s_lshr_b32 s63, s29, 8
-; GFX9-NEXT: s_lshr_b32 s72, s28, 16
-; GFX9-NEXT: s_lshr_b32 s73, s28, 8
-; GFX9-NEXT: s_lshr_b32 s74, s27, 24
-; GFX9-NEXT: s_lshr_b32 s75, s27, 16
-; GFX9-NEXT: s_lshr_b32 s76, s27, 8
-; GFX9-NEXT: s_lshr_b32 s77, s26, 16
-; GFX9-NEXT: s_lshr_b32 s78, s26, 8
-; GFX9-NEXT: s_lshr_b32 s79, s25, 24
-; GFX9-NEXT: s_lshr_b32 s88, s25, 16
-; GFX9-NEXT: s_lshr_b32 s89, s25, 8
-; GFX9-NEXT: s_lshr_b32 s90, s24, 16
-; GFX9-NEXT: s_lshr_b32 s91, s24, 8
-; GFX9-NEXT: s_lshr_b32 s92, s23, 24
-; GFX9-NEXT: s_lshr_b32 s93, s23, 16
-; GFX9-NEXT: s_lshr_b32 s94, s23, 8
-; GFX9-NEXT: s_lshr_b32 s95, s22, 16
-; GFX9-NEXT: s_lshr_b32 s30, s22, 8
-; GFX9-NEXT: s_lshr_b32 s31, s21, 24
-; GFX9-NEXT: s_lshr_b32 s34, s21, 16
-; GFX9-NEXT: s_lshr_b32 s35, s21, 8
-; GFX9-NEXT: s_lshr_b32 s36, s20, 16
-; GFX9-NEXT: s_lshr_b32 s37, s20, 8
-; GFX9-NEXT: s_lshr_b32 s38, s19, 24
-; GFX9-NEXT: s_lshr_b32 s39, s19, 16
-; GFX9-NEXT: s_lshr_b32 s48, s19, 8
-; GFX9-NEXT: s_lshr_b32 s49, s18, 16
-; GFX9-NEXT: s_lshr_b32 s50, s18, 8
-; GFX9-NEXT: s_lshr_b32 s51, s17, 24
-; GFX9-NEXT: s_lshr_b32 s52, s17, 16
-; GFX9-NEXT: s_lshr_b32 s53, s17, 8
-; GFX9-NEXT: s_lshr_b32 s54, s16, 16
-; GFX9-NEXT: s_lshr_b32 s55, s16, 8
+; GFX9-NEXT: s_lshr_b32 s61, s7, 24
+; GFX9-NEXT: s_lshr_b32 s62, s7, 16
+; GFX9-NEXT: s_lshr_b32 s63, s7, 8
+; GFX9-NEXT: s_lshr_b32 s72, s6, 16
+; GFX9-NEXT: s_lshr_b32 s73, s6, 8
+; GFX9-NEXT: s_lshr_b32 s74, s9, 24
+; GFX9-NEXT: s_lshr_b32 s75, s9, 16
+; GFX9-NEXT: s_lshr_b32 s76, s9, 8
+; GFX9-NEXT: s_lshr_b32 s77, s8, 16
+; GFX9-NEXT: s_lshr_b32 s78, s8, 8
+; GFX9-NEXT: s_lshr_b32 s79, s11, 24
+; GFX9-NEXT: s_lshr_b32 s88, s11, 16
+; GFX9-NEXT: s_lshr_b32 s89, s11, 8
+; GFX9-NEXT: s_lshr_b32 s90, s10, 16
+; GFX9-NEXT: s_lshr_b32 s91, s10, 8
+; GFX9-NEXT: s_lshr_b32 s92, s13, 24
+; GFX9-NEXT: s_lshr_b32 s93, s13, 16
+; GFX9-NEXT: s_lshr_b32 s94, s13, 8
+; GFX9-NEXT: s_lshr_b32 s95, s12, 16
+; GFX9-NEXT: s_lshr_b32 s30, s12, 8
+; GFX9-NEXT: s_lshr_b32 s31, s15, 24
+; GFX9-NEXT: s_lshr_b32 s34, s15, 16
+; GFX9-NEXT: s_lshr_b32 s35, s15, 8
+; GFX9-NEXT: s_lshr_b32 s36, s14, 16
+; GFX9-NEXT: s_lshr_b32 s37, s14, 8
+; GFX9-NEXT: s_lshr_b32 s38, s17, 24
+; GFX9-NEXT: s_lshr_b32 s39, s17, 16
+; GFX9-NEXT: s_lshr_b32 s48, s17, 8
+; GFX9-NEXT: s_lshr_b32 s49, s16, 16
+; GFX9-NEXT: s_lshr_b32 s50, s16, 8
+; GFX9-NEXT: s_lshr_b32 s51, s19, 24
+; GFX9-NEXT: s_lshr_b32 s52, s19, 16
+; GFX9-NEXT: s_lshr_b32 s53, s19, 8
+; GFX9-NEXT: s_lshr_b32 s54, s18, 16
+; GFX9-NEXT: s_lshr_b32 s55, s18, 8
; GFX9-NEXT: .LBB69_3: ; %end
-; GFX9-NEXT: s_and_b32 s7, s16, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s55, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s54, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s44, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s17, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s53, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s52, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s51, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_and_b32 s18, s18, 0xff
+; GFX9-NEXT: s_lshl_b32 s21, s55, 8
+; GFX9-NEXT: s_or_b32 s18, s18, s21
+; GFX9-NEXT: s_and_b32 s21, s54, 0xff
+; GFX9-NEXT: s_lshl_b32 s23, s44, 8
+; GFX9-NEXT: s_or_b32 s21, s21, s23
+; GFX9-NEXT: s_and_b32 s18, s18, 0xffff
+; GFX9-NEXT: s_lshl_b32 s21, s21, 16
+; GFX9-NEXT: s_or_b32 s18, s18, s21
+; GFX9-NEXT: v_mov_b32_e32 v1, s18
+; GFX9-NEXT: s_and_b32 s18, s19, 0xff
+; GFX9-NEXT: s_lshl_b32 s19, s53, 8
+; GFX9-NEXT: s_or_b32 s18, s18, s19
+; GFX9-NEXT: s_and_b32 s19, s52, 0xff
+; GFX9-NEXT: s_lshl_b32 s21, s51, 8
+; GFX9-NEXT: s_or_b32 s19, s19, s21
+; GFX9-NEXT: s_and_b32 s18, s18, 0xffff
+; GFX9-NEXT: s_lshl_b32 s19, s19, 16
+; GFX9-NEXT: s_or_b32 s18, s18, s19
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s18, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s50, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s49, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s42, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s18
+; GFX9-NEXT: s_and_b32 s16, s16, 0xff
+; GFX9-NEXT: s_lshl_b32 s18, s50, 8
+; GFX9-NEXT: s_or_b32 s16, s16, s18
+; GFX9-NEXT: s_and_b32 s18, s49, 0xff
+; GFX9-NEXT: s_lshl_b32 s19, s42, 8
+; GFX9-NEXT: s_or_b32 s18, s18, s19
+; GFX9-NEXT: s_and_b32 s16, s16, 0xffff
+; GFX9-NEXT: s_lshl_b32 s18, s18, 16
+; GFX9-NEXT: s_or_b32 s16, s16, s18
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s19, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s48, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s39, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s38, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-NEXT: s_and_b32 s16, s17, 0xff
+; GFX9-NEXT: s_lshl_b32 s17, s48, 8
+; GFX9-NEXT: s_or_b32 s16, s16, s17
+; GFX9-NEXT: s_and_b32 s17, s39, 0xff
+; GFX9-NEXT: s_lshl_b32 s18, s38, 8
+; GFX9-NEXT: s_or_b32 s17, s17, s18
+; GFX9-NEXT: s_and_b32 s16, s16, 0xffff
+; GFX9-NEXT: s_lshl_b32 s17, s17, 16
+; GFX9-NEXT: s_or_b32 s16, s16, s17
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s20, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s37, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s36, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s40, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-NEXT: s_and_b32 s14, s14, 0xff
+; GFX9-NEXT: s_lshl_b32 s16, s37, 8
+; GFX9-NEXT: s_or_b32 s14, s14, s16
+; GFX9-NEXT: s_and_b32 s16, s36, 0xff
+; GFX9-NEXT: s_lshl_b32 s17, s40, 8
+; GFX9-NEXT: s_or_b32 s16, s16, s17
+; GFX9-NEXT: s_and_b32 s14, s14, 0xffff
+; GFX9-NEXT: s_lshl_b32 s16, s16, 16
+; GFX9-NEXT: s_or_b32 s14, s14, s16
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s21, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s35, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s34, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s31, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-NEXT: s_and_b32 s14, s15, 0xff
+; GFX9-NEXT: s_lshl_b32 s15, s35, 8
+; GFX9-NEXT: s_or_b32 s14, s14, s15
+; GFX9-NEXT: s_and_b32 s15, s34, 0xff
+; GFX9-NEXT: s_lshl_b32 s16, s31, 8
+; GFX9-NEXT: s_or_b32 s15, s15, s16
+; GFX9-NEXT: s_and_b32 s14, s14, 0xffff
+; GFX9-NEXT: s_lshl_b32 s15, s15, 16
+; GFX9-NEXT: s_or_b32 s14, s14, s15
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s22, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s30, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s95, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s14, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-NEXT: s_and_b32 s12, s12, 0xff
+; GFX9-NEXT: s_lshl_b32 s14, s30, 8
+; GFX9-NEXT: s_or_b32 s12, s12, s14
+; GFX9-NEXT: s_and_b32 s14, s95, 0xff
+; GFX9-NEXT: s_lshl_b32 s15, s28, 8
+; GFX9-NEXT: s_or_b32 s14, s14, s15
+; GFX9-NEXT: s_and_b32 s12, s12, 0xffff
+; GFX9-NEXT: s_lshl_b32 s14, s14, 16
+; GFX9-NEXT: s_or_b32 s12, s12, s14
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s23, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s94, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s93, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s92, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-NEXT: s_and_b32 s12, s13, 0xff
+; GFX9-NEXT: s_lshl_b32 s13, s94, 8
+; GFX9-NEXT: s_or_b32 s12, s12, s13
+; GFX9-NEXT: s_and_b32 s13, s93, 0xff
+; GFX9-NEXT: s_lshl_b32 s14, s92, 8
+; GFX9-NEXT: s_or_b32 s13, s13, s14
+; GFX9-NEXT: s_and_b32 s12, s12, 0xffff
+; GFX9-NEXT: s_lshl_b32 s13, s13, 16
+; GFX9-NEXT: s_or_b32 s12, s12, s13
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s24, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s91, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s90, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s12, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-NEXT: s_and_b32 s10, s10, 0xff
+; GFX9-NEXT: s_lshl_b32 s12, s91, 8
+; GFX9-NEXT: s_or_b32 s10, s10, s12
+; GFX9-NEXT: s_and_b32 s12, s90, 0xff
+; GFX9-NEXT: s_lshl_b32 s13, s26, 8
+; GFX9-NEXT: s_or_b32 s12, s12, s13
+; GFX9-NEXT: s_and_b32 s10, s10, 0xffff
+; GFX9-NEXT: s_lshl_b32 s12, s12, 16
+; GFX9-NEXT: s_or_b32 s10, s10, s12
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s25, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s89, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s88, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s79, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-NEXT: s_and_b32 s10, s11, 0xff
+; GFX9-NEXT: s_lshl_b32 s11, s89, 8
+; GFX9-NEXT: s_or_b32 s10, s10, s11
+; GFX9-NEXT: s_and_b32 s11, s88, 0xff
+; GFX9-NEXT: s_lshl_b32 s12, s79, 8
+; GFX9-NEXT: s_or_b32 s11, s11, s12
+; GFX9-NEXT: s_and_b32 s10, s10, 0xffff
+; GFX9-NEXT: s_lshl_b32 s11, s11, 16
+; GFX9-NEXT: s_or_b32 s10, s10, s11
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s26, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s78, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s77, 0xff
-; GFX9-NEXT: s_lshl_b32 s10, s10, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s10
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-NEXT: s_and_b32 s8, s8, 0xff
+; GFX9-NEXT: s_lshl_b32 s10, s78, 8
+; GFX9-NEXT: s_or_b32 s8, s8, s10
+; GFX9-NEXT: s_and_b32 s10, s77, 0xff
+; GFX9-NEXT: s_lshl_b32 s11, s24, 8
+; GFX9-NEXT: s_or_b32 s10, s10, s11
+; GFX9-NEXT: s_and_b32 s8, s8, 0xffff
+; GFX9-NEXT: s_lshl_b32 s10, s10, 16
+; GFX9-NEXT: s_or_b32 s8, s8, s10
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s27, 0xff
+; GFX9-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-NEXT: s_and_b32 s8, s9, 0xff
; GFX9-NEXT: s_lshl_b32 s9, s76, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_or_b32 s8, s8, s9
; GFX9-NEXT: s_and_b32 s9, s75, 0xff
; GFX9-NEXT: s_lshl_b32 s10, s74, 8
; GFX9-NEXT: s_or_b32 s9, s9, s10
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT: s_and_b32 s8, s8, 0xffff
; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_or_b32 s8, s8, s9
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s28, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s73, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s72, 0xff
-; GFX9-NEXT: s_lshl_b32 s8, s8, 8
-; GFX9-NEXT: s_or_b32 s8, s9, s8
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s8, s8, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s8
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s29, 0xff
-; GFX9-NEXT: s_lshl_b32 s8, s63, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s8
-; GFX9-NEXT: s_and_b32 s8, s62, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s61, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-NEXT: s_and_b32 s6, s6, 0xff
+; GFX9-NEXT: s_lshl_b32 s8, s73, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s8
+; GFX9-NEXT: s_and_b32 s8, s72, 0xff
+; GFX9-NEXT: s_lshl_b32 s9, s22, 8
; GFX9-NEXT: s_or_b32 s8, s8, s9
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s8, s8, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s8
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s6, s7, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s63, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s62, 0xff
+; GFX9-NEXT: s_lshl_b32 s8, s61, 8
; GFX9-NEXT: s_or_b32 s7, s7, s8
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: s_and_b32 s4, s4, 0xff
-; GFX9-NEXT: s_lshl_b32 s7, s60, 8
-; GFX9-NEXT: s_or_b32 s4, s4, s7
-; GFX9-NEXT: s_and_b32 s7, s59, 0xff
-; GFX9-NEXT: s_lshl_b32 s6, s6, 8
-; GFX9-NEXT: s_or_b32 s6, s7, s6
+; GFX9-NEXT: s_lshl_b32 s6, s60, 8
+; GFX9-NEXT: s_or_b32 s4, s4, s6
+; GFX9-NEXT: s_and_b32 s6, s59, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s20, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
; GFX9-NEXT: s_lshl_b32 s6, s6, 16
; GFX9-NEXT: s_or_b32 s4, s4, s6
@@ -40169,24 +40672,24 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: v_readlane_b32 s55, v4, 15
-; GFX9-NEXT: v_readlane_b32 s54, v4, 14
-; GFX9-NEXT: v_readlane_b32 s53, v4, 13
-; GFX9-NEXT: v_readlane_b32 s52, v4, 12
-; GFX9-NEXT: v_readlane_b32 s51, v4, 11
-; GFX9-NEXT: v_readlane_b32 s50, v4, 10
-; GFX9-NEXT: v_readlane_b32 s49, v4, 9
-; GFX9-NEXT: v_readlane_b32 s48, v4, 8
-; GFX9-NEXT: v_readlane_b32 s39, v4, 7
-; GFX9-NEXT: v_readlane_b32 s38, v4, 6
-; GFX9-NEXT: v_readlane_b32 s37, v4, 5
-; GFX9-NEXT: v_readlane_b32 s36, v4, 4
-; GFX9-NEXT: v_readlane_b32 s35, v4, 3
-; GFX9-NEXT: v_readlane_b32 s34, v4, 2
-; GFX9-NEXT: v_readlane_b32 s31, v4, 1
-; GFX9-NEXT: v_readlane_b32 s30, v4, 0
+; GFX9-NEXT: v_readlane_b32 s55, v18, 15
+; GFX9-NEXT: v_readlane_b32 s54, v18, 14
+; GFX9-NEXT: v_readlane_b32 s53, v18, 13
+; GFX9-NEXT: v_readlane_b32 s52, v18, 12
+; GFX9-NEXT: v_readlane_b32 s51, v18, 11
+; GFX9-NEXT: v_readlane_b32 s50, v18, 10
+; GFX9-NEXT: v_readlane_b32 s49, v18, 9
+; GFX9-NEXT: v_readlane_b32 s48, v18, 8
+; GFX9-NEXT: v_readlane_b32 s39, v18, 7
+; GFX9-NEXT: v_readlane_b32 s38, v18, 6
+; GFX9-NEXT: v_readlane_b32 s37, v18, 5
+; GFX9-NEXT: v_readlane_b32 s36, v18, 4
+; GFX9-NEXT: v_readlane_b32 s35, v18, 3
+; GFX9-NEXT: v_readlane_b32 s34, v18, 2
+; GFX9-NEXT: v_readlane_b32 s31, v18, 1
+; GFX9-NEXT: v_readlane_b32 s30, v18, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -40211,31 +40714,31 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; GFX9-NEXT: ; implicit-def: $sgpr31
; GFX9-NEXT: ; implicit-def: $sgpr30
; GFX9-NEXT: ; implicit-def: $sgpr95
-; GFX9-NEXT: ; implicit-def: $sgpr14
+; GFX9-NEXT: ; implicit-def: $sgpr28
; GFX9-NEXT: ; implicit-def: $sgpr94
; GFX9-NEXT: ; implicit-def: $sgpr93
; GFX9-NEXT: ; implicit-def: $sgpr92
; GFX9-NEXT: ; implicit-def: $sgpr91
; GFX9-NEXT: ; implicit-def: $sgpr90
-; GFX9-NEXT: ; implicit-def: $sgpr12
+; GFX9-NEXT: ; implicit-def: $sgpr26
; GFX9-NEXT: ; implicit-def: $sgpr89
; GFX9-NEXT: ; implicit-def: $sgpr88
; GFX9-NEXT: ; implicit-def: $sgpr79
; GFX9-NEXT: ; implicit-def: $sgpr78
; GFX9-NEXT: ; implicit-def: $sgpr77
-; GFX9-NEXT: ; implicit-def: $sgpr10
+; GFX9-NEXT: ; implicit-def: $sgpr24
; GFX9-NEXT: ; implicit-def: $sgpr76
; GFX9-NEXT: ; implicit-def: $sgpr75
; GFX9-NEXT: ; implicit-def: $sgpr74
; GFX9-NEXT: ; implicit-def: $sgpr73
; GFX9-NEXT: ; implicit-def: $sgpr72
-; GFX9-NEXT: ; implicit-def: $sgpr8
+; GFX9-NEXT: ; implicit-def: $sgpr22
; GFX9-NEXT: ; implicit-def: $sgpr63
; GFX9-NEXT: ; implicit-def: $sgpr62
; GFX9-NEXT: ; implicit-def: $sgpr61
; GFX9-NEXT: ; implicit-def: $sgpr60
; GFX9-NEXT: ; implicit-def: $sgpr59
-; GFX9-NEXT: ; implicit-def: $sgpr6
+; GFX9-NEXT: ; implicit-def: $sgpr20
; GFX9-NEXT: ; implicit-def: $sgpr58
; GFX9-NEXT: ; implicit-def: $sgpr57
; GFX9-NEXT: ; implicit-def: $sgpr56
@@ -45775,111 +46278,139 @@ define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i
; VI-LABEL: bitcast_v32i16_to_v8f64_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, s16
+; VI-NEXT: v_mov_b32_e32 v4, s17
+; VI-NEXT: v_mov_b32_e32 v5, s18
+; VI-NEXT: v_mov_b32_e32 v6, s19
+; VI-NEXT: v_mov_b32_e32 v7, s20
+; VI-NEXT: v_mov_b32_e32 v8, s21
+; VI-NEXT: v_mov_b32_e32 v9, s22
+; VI-NEXT: v_mov_b32_e32 v10, s23
+; VI-NEXT: v_mov_b32_e32 v11, s24
+; VI-NEXT: v_mov_b32_e32 v12, s25
+; VI-NEXT: v_mov_b32_e32 v13, s26
+; VI-NEXT: v_mov_b32_e32 v14, s27
+; VI-NEXT: v_mov_b32_e32 v15, s28
+; VI-NEXT: v_mov_b32_e32 v16, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: v_readfirstlane_b32 s6, v0
+; VI-NEXT: v_readfirstlane_b32 s6, v3
+; VI-NEXT: v_readfirstlane_b32 s7, v4
+; VI-NEXT: v_readfirstlane_b32 s8, v5
+; VI-NEXT: v_readfirstlane_b32 s9, v6
+; VI-NEXT: v_readfirstlane_b32 s10, v7
+; VI-NEXT: v_readfirstlane_b32 s11, v8
+; VI-NEXT: v_readfirstlane_b32 s12, v9
+; VI-NEXT: v_readfirstlane_b32 s13, v10
+; VI-NEXT: v_readfirstlane_b32 s14, v11
+; VI-NEXT: v_readfirstlane_b32 s15, v12
+; VI-NEXT: v_readfirstlane_b32 s16, v13
+; VI-NEXT: v_readfirstlane_b32 s17, v14
+; VI-NEXT: v_readfirstlane_b32 s18, v15
+; VI-NEXT: v_readfirstlane_b32 s19, v16
+; VI-NEXT: v_readfirstlane_b32 s20, v0
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_readfirstlane_b32 s7, v1
+; VI-NEXT: v_readfirstlane_b32 s21, v1
; VI-NEXT: s_cbranch_scc0 .LBB75_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB75_3
; VI-NEXT: .LBB75_2: ; %cmp.true
-; VI-NEXT: s_add_i32 s5, s7, 3
-; VI-NEXT: s_and_b32 s4, s7, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s21, 3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s6, 3
-; VI-NEXT: s_add_i32 s7, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s6, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s20, 3
+; VI-NEXT: s_add_i32 s21, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s29, 3
-; VI-NEXT: s_add_i32 s6, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s19, 3
+; VI-NEXT: s_add_i32 s20, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s28, 3
-; VI-NEXT: s_add_i32 s29, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s18, 3
+; VI-NEXT: s_add_i32 s19, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s27, 3
-; VI-NEXT: s_add_i32 s28, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s17, 3
+; VI-NEXT: s_add_i32 s18, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s26, 3
-; VI-NEXT: s_add_i32 s27, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s16, 3
+; VI-NEXT: s_add_i32 s17, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s25, 3
-; VI-NEXT: s_add_i32 s26, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s15, 3
+; VI-NEXT: s_add_i32 s16, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s15, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s24, 3
-; VI-NEXT: s_add_i32 s25, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s14, 3
+; VI-NEXT: s_add_i32 s15, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s14, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s23, 3
-; VI-NEXT: s_add_i32 s24, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s13, 3
+; VI-NEXT: s_add_i32 s14, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s13, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s22, 3
-; VI-NEXT: s_add_i32 s23, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s12, 3
+; VI-NEXT: s_add_i32 s13, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s12, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s21, 3
-; VI-NEXT: s_add_i32 s22, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s11, 3
+; VI-NEXT: s_add_i32 s12, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s11, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s20, 3
-; VI-NEXT: s_add_i32 s21, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s10, 3
+; VI-NEXT: s_add_i32 s11, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s10, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s19, 3
-; VI-NEXT: s_add_i32 s20, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s9, 3
+; VI-NEXT: s_add_i32 s10, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s9, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s18, 3
-; VI-NEXT: s_add_i32 s19, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s8, 3
+; VI-NEXT: s_add_i32 s9, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s8, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s17, 3
-; VI-NEXT: s_add_i32 s18, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s7, 3
+; VI-NEXT: s_add_i32 s8, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s7, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s5, s16, 3
-; VI-NEXT: s_add_i32 s17, s4, 0x30000
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s6, 3
+; VI-NEXT: s_add_i32 s7, s4, 0x30000
+; VI-NEXT: s_and_b32 s4, s6, 0xffff0000
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s16, s4, 0x30000
+; VI-NEXT: s_add_i32 s6, s4, 0x30000
; VI-NEXT: .LBB75_3: ; %end
-; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v1, s17
-; VI-NEXT: v_mov_b32_e32 v2, s18
-; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: v_mov_b32_e32 v4, s20
-; VI-NEXT: v_mov_b32_e32 v5, s21
-; VI-NEXT: v_mov_b32_e32 v6, s22
-; VI-NEXT: v_mov_b32_e32 v7, s23
-; VI-NEXT: v_mov_b32_e32 v8, s24
-; VI-NEXT: v_mov_b32_e32 v9, s25
-; VI-NEXT: v_mov_b32_e32 v10, s26
-; VI-NEXT: v_mov_b32_e32 v11, s27
-; VI-NEXT: v_mov_b32_e32 v12, s28
-; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: v_mov_b32_e32 v14, s6
-; VI-NEXT: v_mov_b32_e32 v15, s7
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: v_mov_b32_e32 v4, s10
+; VI-NEXT: v_mov_b32_e32 v5, s11
+; VI-NEXT: v_mov_b32_e32 v6, s12
+; VI-NEXT: v_mov_b32_e32 v7, s13
+; VI-NEXT: v_mov_b32_e32 v8, s14
+; VI-NEXT: v_mov_b32_e32 v9, s15
+; VI-NEXT: v_mov_b32_e32 v10, s16
+; VI-NEXT: v_mov_b32_e32 v11, s17
+; VI-NEXT: v_mov_b32_e32 v12, s18
+; VI-NEXT: v_mov_b32_e32 v13, s19
+; VI-NEXT: v_mov_b32_e32 v14, s20
+; VI-NEXT: v_mov_b32_e32 v15, s21
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB75_4:
; VI-NEXT: s_branch .LBB75_2
@@ -46253,102 +46784,114 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: s_and_b64 s[6:7], vcc, exec
-; SI-NEXT: v_readfirstlane_b32 s5, v1
+; SI-NEXT: v_mov_b32_e32 v54, s16
+; SI-NEXT: v_mov_b32_e32 v55, s17
+; SI-NEXT: v_mov_b32_e32 v52, s18
+; SI-NEXT: v_mov_b32_e32 v53, s19
+; SI-NEXT: v_mov_b32_e32 v50, s20
+; SI-NEXT: v_mov_b32_e32 v51, s21
+; SI-NEXT: v_mov_b32_e32 v48, s22
+; SI-NEXT: v_mov_b32_e32 v49, s23
+; SI-NEXT: v_mov_b32_e32 v38, s24
+; SI-NEXT: v_mov_b32_e32 v39, s25
+; SI-NEXT: v_mov_b32_e32 v36, s26
+; SI-NEXT: v_mov_b32_e32 v37, s27
+; SI-NEXT: v_mov_b32_e32 v34, s28
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: v_mov_b32_e32 v35, s29
; SI-NEXT: s_cbranch_scc0 .LBB77_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_lshr_b32 s6, s5, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v31, s6
-; SI-NEXT: s_lshr_b32 s6, s4, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v29, s6
-; SI-NEXT: s_lshr_b32 s6, s29, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v27, s6
-; SI-NEXT: s_lshr_b32 s6, s28, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v25, s6
-; SI-NEXT: s_lshr_b32 s6, s27, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v23, s6
-; SI-NEXT: s_lshr_b32 s6, s26, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v21, s6
-; SI-NEXT: s_lshr_b32 s6, s25, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v19, s6
-; SI-NEXT: s_lshr_b32 s6, s24, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v17, s6
-; SI-NEXT: s_lshr_b32 s6, s23, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v15, s6
-; SI-NEXT: s_lshr_b32 s6, s22, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v13, s6
-; SI-NEXT: s_lshr_b32 s6, s21, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v11, s6
-; SI-NEXT: s_lshr_b32 s6, s20, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v9, s6
-; SI-NEXT: s_lshr_b32 s6, s19, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v7, s6
-; SI-NEXT: s_lshr_b32 s6, s18, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v5, s6
-; SI-NEXT: s_lshr_b32 s6, s17, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v3, s6
-; SI-NEXT: s_lshr_b32 s6, s16, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s6
-; SI-NEXT: v_cvt_f32_f16_e32 v30, s5
-; SI-NEXT: v_cvt_f32_f16_e32 v28, s4
-; SI-NEXT: v_cvt_f32_f16_e32 v26, s29
-; SI-NEXT: v_cvt_f32_f16_e32 v24, s28
-; SI-NEXT: v_cvt_f32_f16_e32 v22, s27
-; SI-NEXT: v_cvt_f32_f16_e32 v20, s26
-; SI-NEXT: v_cvt_f32_f16_e32 v18, s25
-; SI-NEXT: v_cvt_f32_f16_e32 v16, s24
-; SI-NEXT: v_cvt_f32_f16_e32 v14, s23
-; SI-NEXT: v_cvt_f32_f16_e32 v12, s22
-; SI-NEXT: v_cvt_f32_f16_e32 v10, s21
-; SI-NEXT: v_cvt_f32_f16_e32 v8, s20
-; SI-NEXT: v_cvt_f32_f16_e32 v6, s19
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s18
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s17
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v29, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v27, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54
+; SI-NEXT: v_cvt_f32_f16_e32 v33, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v18, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v49
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v50
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v53
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v52
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v55
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v54
; SI-NEXT: s_cbranch_execnz .LBB77_3
; SI-NEXT: .LBB77_2: ; %cmp.true
-; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0
-; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0
-; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0
-; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0
-; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0
-; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0
-; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0
-; SI-NEXT: v_add_f64 v[14:15], s[4:5], 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0
-; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7
+; SI-NEXT: v_add_f64 v[31:32], v[54:55], 1.0
+; SI-NEXT: v_add_f64 v[2:3], v[52:53], 1.0
+; SI-NEXT: v_add_f64 v[4:5], v[50:51], 1.0
+; SI-NEXT: v_add_f64 v[6:7], v[48:49], 1.0
+; SI-NEXT: v_add_f64 v[8:9], v[38:39], 1.0
+; SI-NEXT: v_add_f64 v[10:11], v[36:37], 1.0
+; SI-NEXT: v_add_f64 v[12:13], v[34:35], 1.0
+; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v32
+; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10
; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11
; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12
; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15
-; SI-NEXT: v_cvt_f32_f16_e32 v24, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v3
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v30, v15
-; SI-NEXT: v_cvt_f32_f16_e32 v28, v14
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v0
; SI-NEXT: v_cvt_f32_f16_e32 v26, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v12
; SI-NEXT: v_cvt_f32_f16_e32 v22, v11
; SI-NEXT: v_cvt_f32_f16_e32 v20, v10
; SI-NEXT: v_cvt_f32_f16_e32 v18, v9
; SI-NEXT: v_cvt_f32_f16_e32 v16, v8
; SI-NEXT: v_cvt_f32_f16_e32 v14, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v6
; SI-NEXT: v_cvt_f32_f16_e32 v10, v5
; SI-NEXT: v_cvt_f32_f16_e32 v8, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v3
; SI-NEXT: v_cvt_f32_f16_e32 v4, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v34
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
@@ -46356,19 +46899,21 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v38
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v49
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v50
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v53
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v52
+; SI-NEXT: v_cvt_f32_f16_e32 v33, v33
; SI-NEXT: .LBB77_3: ; %end
+; SI-NEXT: v_mov_b32_e32 v0, v32
+; SI-NEXT: v_mov_b32_e32 v1, v33
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB77_4:
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr4
@@ -49226,172 +49771,209 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mul_f32_e64 v62, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v45, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v44, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v42, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17
+; SI-NEXT: v_mul_f32_e64 v41, 1.0, s21
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e64 v63, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29
; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9
; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13
; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15
; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v16
+; SI-NEXT: v_mul_f32_e64 v39, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v37, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s28
; SI-NEXT: s_cbranch_scc0 .LBB83_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17
-; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16
-; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16
-; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16
-; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16
-; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16
-; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16
-; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v62
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60
+; SI-NEXT: v_lshr_b64 v[0:1], v[39:40], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16
+; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v41
+; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63
+; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61
+; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v59
+; SI-NEXT: v_lshr_b64 v[5:6], v[37:38], 16
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58
+; SI-NEXT: v_lshr_b64 v[6:7], v[35:36], 16
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v57
+; SI-NEXT: v_lshr_b64 v[7:8], v[33:34], 16
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v56
+; SI-NEXT: v_lshr_b64 v[8:9], v[31:32], 16
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v47
+; SI-NEXT: v_lshr_b64 v[9:10], v[29:30], 16
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v46
+; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v45
+; SI-NEXT: v_lshr_b64 v[11:12], v[25:26], 16
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v44
+; SI-NEXT: v_lshr_b64 v[12:13], v[23:24], 16
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43
+; SI-NEXT: v_lshr_b64 v[13:14], v[21:22], 16
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42
+; SI-NEXT: v_lshr_b64 v[14:15], v[19:20], 16
+; SI-NEXT: v_mov_b32_e32 v20, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16
+; SI-NEXT: v_mov_b32_e32 v18, v20
; SI-NEXT: s_cbranch_execnz .LBB83_3
; SI-NEXT: .LBB83_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v63
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v56
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v47
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
+; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
+; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v43
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
+; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v42
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
+; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16
; SI-NEXT: .LBB83_3: ; %end
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB83_4:
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -49400,687 +49982,665 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-LABEL: bitcast_v32bf16_to_v8f64_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v19, s30, 0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: v_writelane_b32 v19, s31, 1
-; VI-NEXT: v_readfirstlane_b32 s30, v0
+; VI-NEXT: v_mov_b32_e32 v10, v2
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; VI-NEXT: v_mov_b32_e32 v15, v1
+; VI-NEXT: v_mov_b32_e32 v14, v0
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v1, s17
+; VI-NEXT: v_mov_b32_e32 v2, s18
+; VI-NEXT: v_mov_b32_e32 v3, s19
+; VI-NEXT: v_mov_b32_e32 v4, s20
+; VI-NEXT: v_mov_b32_e32 v5, s21
+; VI-NEXT: v_mov_b32_e32 v6, s22
+; VI-NEXT: v_mov_b32_e32 v7, s23
+; VI-NEXT: v_mov_b32_e32 v8, s24
+; VI-NEXT: v_mov_b32_e32 v9, s25
+; VI-NEXT: v_mov_b32_e32 v11, s27
+; VI-NEXT: v_mov_b32_e32 v13, s29
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_readfirstlane_b32 s31, v1
-; VI-NEXT: s_cbranch_scc0 .LBB83_3
+; VI-NEXT: v_mov_b32_e32 v10, s26
+; VI-NEXT: v_mov_b32_e32 v12, s28
+; VI-NEXT: s_cbranch_scc0 .LBB83_4
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB83_4
+; VI-NEXT: s_cbranch_execnz .LBB83_3
; VI-NEXT: .LBB83_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s31, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_lshl_b32 s4, s30, 16
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v0
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s4, v0
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17]
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_bfe_u32 v15, v14, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v14
+; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[17:18]
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_bfe_u32 v13, v12, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12
+; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[18:19]
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_bfe_u32 v11, v10, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v10
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[19:20]
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20]
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v8
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[20:21]
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21]
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_bfe_u32 v7, v6, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[21:22]
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; VI-NEXT: v_lshrrev_b64 v[21:22], 16, v[21:22]
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16
-; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v4
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[22:23]
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23]
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v2
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[23:24]
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v16, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v16, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1
+; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24]
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
-; VI-NEXT: v_add_f32_e32 v16, s4, v0
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16
-; VI-NEXT: v_add_f32_e32 v16, s4, v0
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_bfe_u32 v17, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v24, v3, v5, vcc
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16
-; VI-NEXT: s_branch .LBB83_5
-; VI-NEXT: .LBB83_3:
-; VI-NEXT: s_branch .LBB83_2
-; VI-NEXT: .LBB83_4:
-; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v1, s17
-; VI-NEXT: v_mov_b32_e32 v2, s18
-; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: v_mov_b32_e32 v4, s20
-; VI-NEXT: v_mov_b32_e32 v5, s21
-; VI-NEXT: v_mov_b32_e32 v6, s22
-; VI-NEXT: v_mov_b32_e32 v7, s23
-; VI-NEXT: v_mov_b32_e32 v8, s24
-; VI-NEXT: v_mov_b32_e32 v9, s25
-; VI-NEXT: v_mov_b32_e32 v10, s26
-; VI-NEXT: v_mov_b32_e32 v11, s27
-; VI-NEXT: v_mov_b32_e32 v12, s28
-; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: v_mov_b32_e32 v14, s30
-; VI-NEXT: v_mov_b32_e32 v15, s31
-; VI-NEXT: .LBB83_5: ; %end
-; VI-NEXT: v_readlane_b32 s31, v19, 1
-; VI-NEXT: v_readlane_b32 s30, v19, 0
-; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[24:25]
+; VI-NEXT: v_mov_b32_e32 v1, v23
+; VI-NEXT: v_mov_b32_e32 v3, v22
+; VI-NEXT: v_mov_b32_e32 v5, v21
+; VI-NEXT: v_mov_b32_e32 v7, v20
+; VI-NEXT: v_mov_b32_e32 v9, v19
+; VI-NEXT: v_mov_b32_e32 v11, v18
+; VI-NEXT: v_mov_b32_e32 v13, v17
+; VI-NEXT: v_mov_b32_e32 v15, v16
+; VI-NEXT: .LBB83_3: ; %end
; VI-NEXT: s_setpc_b64 s[30:31]
+; VI-NEXT: .LBB83_4:
+; VI-NEXT: s_branch .LBB83_2
;
; GFX9-LABEL: bitcast_v32bf16_to_v8f64_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v20, s30, 0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT: v_writelane_b32 v20, s31, 1
-; GFX9-NEXT: v_readfirstlane_b32 s30, v0
+; GFX9-NEXT: v_mov_b32_e32 v13, v2
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; GFX9-NEXT: v_mov_b32_e32 v15, v1
+; GFX9-NEXT: v_mov_b32_e32 v14, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s16
+; GFX9-NEXT: v_mov_b32_e32 v1, s17
+; GFX9-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-NEXT: v_mov_b32_e32 v3, s19
+; GFX9-NEXT: v_mov_b32_e32 v4, s20
+; GFX9-NEXT: v_mov_b32_e32 v5, s21
+; GFX9-NEXT: v_mov_b32_e32 v6, s22
+; GFX9-NEXT: v_mov_b32_e32 v7, s23
+; GFX9-NEXT: v_mov_b32_e32 v8, s24
+; GFX9-NEXT: v_mov_b32_e32 v9, s25
+; GFX9-NEXT: v_mov_b32_e32 v10, s26
+; GFX9-NEXT: v_mov_b32_e32 v11, s27
+; GFX9-NEXT: v_mov_b32_e32 v12, s28
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT: v_readfirstlane_b32 s31, v1
-; GFX9-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX9-NEXT: v_mov_b32_e32 v13, s29
+; GFX9-NEXT: s_cbranch_scc0 .LBB83_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB83_4
+; GFX9-NEXT: s_cbranch_execnz .LBB83_3
; GFX9-NEXT: .LBB83_2: ; %cmp.true
-; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s31, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add_f32_e32 v3, s4, v0
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
-; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: s_lshl_b32 s4, s30, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_add_f32_e32 v4, s4, v0
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v5, v5, v4
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v17, v17, v16
+; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v16, v16, v15
+; GFX9-NEXT: v_add_u32_e32 v16, 0x7fff, v16
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc
; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff
-; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX9-NEXT: v_and_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v15, v17, 16, v15
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v14, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v14
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v14
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v14, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v14, v17, 16, v14
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v13
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v13, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v13
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v13, v17, 16, v13
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v12
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v12
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v12
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v12, v17, 16, v12
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v11
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v11
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v11
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v11, v17, 16, v11
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v10
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v10, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v10
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v10
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v10, v17, 16, v10
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v9
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v9, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v9
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v9, v17, 16, v9
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v8
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v8, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v8
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v8, v17, 16, v8
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v7, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v7
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v7, v17, 16, v7
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v6
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v5, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v5
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v5
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v5, v17, 16, v5
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v4
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v4, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v4
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s29, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s28, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s27, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s26, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s25, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s24, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s23, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s22, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s21, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s20, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s19, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s18, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v17, v17, v2
-; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v4, v17, 16, v4
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v3, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v3
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v3
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v2
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v2
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v17, v17, v1
-; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s17, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc
-; GFX9-NEXT: v_add_f32_e32 v17, s4, v0
+; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17
-; GFX9-NEXT: v_add_f32_e32 v17, s4, v0
+; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v1
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
-; GFX9-NEXT: s_lshl_b32 s4, s16, 16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_add_f32_e32 v0, s4, v0
+; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1
; GFX9-NEXT: v_add_u32_e32 v18, v18, v0
; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16
-; GFX9-NEXT: s_branch .LBB83_5
-; GFX9-NEXT: .LBB83_3:
-; GFX9-NEXT: s_branch .LBB83_2
-; GFX9-NEXT: .LBB83_4:
-; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: v_mov_b32_e32 v1, s17
-; GFX9-NEXT: v_mov_b32_e32 v2, s18
-; GFX9-NEXT: v_mov_b32_e32 v3, s19
-; GFX9-NEXT: v_mov_b32_e32 v4, s20
-; GFX9-NEXT: v_mov_b32_e32 v5, s21
-; GFX9-NEXT: v_mov_b32_e32 v6, s22
-; GFX9-NEXT: v_mov_b32_e32 v7, s23
-; GFX9-NEXT: v_mov_b32_e32 v8, s24
-; GFX9-NEXT: v_mov_b32_e32 v9, s25
-; GFX9-NEXT: v_mov_b32_e32 v10, s26
-; GFX9-NEXT: v_mov_b32_e32 v11, s27
-; GFX9-NEXT: v_mov_b32_e32 v12, s28
-; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: v_mov_b32_e32 v14, s30
-; GFX9-NEXT: v_mov_b32_e32 v15, s31
-; GFX9-NEXT: .LBB83_5: ; %end
-; GFX9-NEXT: v_readlane_b32 s31, v20, 1
-; GFX9-NEXT: v_readlane_b32 s30, v20, 0
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v0, v17, 16, v0
+; GFX9-NEXT: .LBB83_3: ; %end
; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: .LBB83_4:
+; GFX9-NEXT: s_branch .LBB83_2
;
; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v8f64_scalar:
; GFX11-TRUE16: ; %bb.0:
@@ -51147,22 +51707,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v14
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v33
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -51198,6 +51742,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -51472,22 +52032,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18
@@ -51521,6 +52065,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -51784,25 +52344,9 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -51829,6 +52373,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -52428,10 +52988,38 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_writelane_b32 v40, s83, 27
; SI-NEXT: v_writelane_b32 v40, s84, 28
; SI-NEXT: v_writelane_b32 v40, s85, 29
+; SI-NEXT: v_mov_b32_e32 v4, s16
+; SI-NEXT: v_mov_b32_e32 v5, s17
+; SI-NEXT: v_mov_b32_e32 v6, s18
+; SI-NEXT: v_mov_b32_e32 v7, s19
+; SI-NEXT: v_mov_b32_e32 v8, s20
+; SI-NEXT: v_mov_b32_e32 v9, s21
+; SI-NEXT: v_mov_b32_e32 v10, s22
+; SI-NEXT: v_mov_b32_e32 v11, s23
+; SI-NEXT: v_mov_b32_e32 v12, s24
+; SI-NEXT: v_mov_b32_e32 v13, s25
+; SI-NEXT: v_mov_b32_e32 v14, s26
+; SI-NEXT: v_mov_b32_e32 v15, s27
+; SI-NEXT: v_mov_b32_e32 v16, s28
+; SI-NEXT: v_mov_b32_e32 v17, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; SI-NEXT: v_writelane_b32 v40, s86, 30
+; SI-NEXT: v_readfirstlane_b32 s18, v4
+; SI-NEXT: v_readfirstlane_b32 s19, v5
+; SI-NEXT: v_readfirstlane_b32 s16, v6
+; SI-NEXT: v_readfirstlane_b32 s17, v7
+; SI-NEXT: v_readfirstlane_b32 s14, v8
+; SI-NEXT: v_readfirstlane_b32 s15, v9
+; SI-NEXT: v_readfirstlane_b32 s12, v10
+; SI-NEXT: v_readfirstlane_b32 s13, v11
+; SI-NEXT: v_readfirstlane_b32 s10, v12
+; SI-NEXT: v_readfirstlane_b32 s11, v13
+; SI-NEXT: v_readfirstlane_b32 s8, v14
+; SI-NEXT: v_readfirstlane_b32 s9, v15
+; SI-NEXT: v_readfirstlane_b32 s6, v16
+; SI-NEXT: v_readfirstlane_b32 s7, v17
; SI-NEXT: v_readfirstlane_b32 s4, v1
-; SI-NEXT: s_and_b64 s[6:7], vcc, exec
+; SI-NEXT: s_and_b64 s[20:21], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v2
; SI-NEXT: v_writelane_b32 v40, s87, 31
; SI-NEXT: s_cbranch_scc0 .LBB85_3
@@ -52439,62 +53027,62 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: s_lshr_b32 s48, s5, 24
; SI-NEXT: s_lshr_b32 s49, s5, 16
; SI-NEXT: s_lshr_b32 s50, s5, 8
-; SI-NEXT: s_lshr_b32 s51, s29, 24
-; SI-NEXT: s_lshr_b32 s52, s29, 16
-; SI-NEXT: s_lshr_b32 s53, s29, 8
-; SI-NEXT: s_lshr_b32 s54, s27, 24
-; SI-NEXT: s_lshr_b32 s55, s27, 16
-; SI-NEXT: s_lshr_b32 s64, s27, 8
-; SI-NEXT: s_lshr_b32 s65, s25, 24
-; SI-NEXT: s_lshr_b32 s66, s25, 16
-; SI-NEXT: s_lshr_b32 s67, s25, 8
-; SI-NEXT: s_lshr_b32 s68, s23, 24
-; SI-NEXT: s_lshr_b32 s69, s23, 16
-; SI-NEXT: s_lshr_b32 s70, s23, 8
-; SI-NEXT: s_lshr_b32 s71, s21, 24
-; SI-NEXT: s_lshr_b32 s80, s21, 16
-; SI-NEXT: s_lshr_b32 s81, s21, 8
-; SI-NEXT: s_lshr_b32 s82, s19, 24
-; SI-NEXT: s_lshr_b32 s83, s19, 16
-; SI-NEXT: s_lshr_b32 s84, s19, 8
-; SI-NEXT: s_lshr_b32 s85, s17, 24
-; SI-NEXT: s_lshr_b32 s86, s17, 16
-; SI-NEXT: s_lshr_b32 s87, s17, 8
-; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 8
-; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 24
-; SI-NEXT: s_lshr_b64 s[60:61], s[28:29], 16
-; SI-NEXT: s_lshr_b64 s[72:73], s[28:29], 8
-; SI-NEXT: s_lshr_b64 s[76:77], s[26:27], 24
-; SI-NEXT: s_lshr_b64 s[78:79], s[26:27], 16
-; SI-NEXT: s_lshr_b64 s[88:89], s[26:27], 8
-; SI-NEXT: s_lshr_b64 s[92:93], s[24:25], 24
-; SI-NEXT: s_lshr_b64 s[94:95], s[24:25], 16
-; SI-NEXT: s_lshr_b64 s[30:31], s[24:25], 8
-; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24
-; SI-NEXT: s_lshr_b64 s[36:37], s[22:23], 16
-; SI-NEXT: s_lshr_b64 s[38:39], s[22:23], 8
-; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24
-; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16
-; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 8
-; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24
-; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16
-; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 8
-; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 24
-; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16
-; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 8
+; SI-NEXT: s_lshr_b32 s51, s7, 24
+; SI-NEXT: s_lshr_b32 s52, s7, 16
+; SI-NEXT: s_lshr_b32 s53, s7, 8
+; SI-NEXT: s_lshr_b32 s54, s9, 24
+; SI-NEXT: s_lshr_b32 s55, s9, 16
+; SI-NEXT: s_lshr_b32 s64, s9, 8
+; SI-NEXT: s_lshr_b32 s65, s11, 24
+; SI-NEXT: s_lshr_b32 s66, s11, 16
+; SI-NEXT: s_lshr_b32 s67, s11, 8
+; SI-NEXT: s_lshr_b32 s68, s13, 24
+; SI-NEXT: s_lshr_b32 s69, s13, 16
+; SI-NEXT: s_lshr_b32 s70, s13, 8
+; SI-NEXT: s_lshr_b32 s71, s15, 24
+; SI-NEXT: s_lshr_b32 s80, s15, 16
+; SI-NEXT: s_lshr_b32 s81, s15, 8
+; SI-NEXT: s_lshr_b32 s82, s17, 24
+; SI-NEXT: s_lshr_b32 s83, s17, 16
+; SI-NEXT: s_lshr_b32 s84, s17, 8
+; SI-NEXT: s_lshr_b32 s85, s19, 24
+; SI-NEXT: s_lshr_b32 s86, s19, 16
+; SI-NEXT: s_lshr_b32 s87, s19, 8
+; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24
+; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16
+; SI-NEXT: s_lshr_b64 s[28:29], s[4:5], 8
+; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24
+; SI-NEXT: s_lshr_b64 s[60:61], s[6:7], 16
+; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 8
+; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 24
+; SI-NEXT: s_lshr_b64 s[78:79], s[8:9], 16
+; SI-NEXT: s_lshr_b64 s[88:89], s[8:9], 8
+; SI-NEXT: s_lshr_b64 s[92:93], s[10:11], 24
+; SI-NEXT: s_lshr_b64 s[94:95], s[10:11], 16
+; SI-NEXT: s_lshr_b64 s[30:31], s[10:11], 8
+; SI-NEXT: s_lshr_b64 s[34:35], s[12:13], 24
+; SI-NEXT: s_lshr_b64 s[36:37], s[12:13], 16
+; SI-NEXT: s_lshr_b64 s[38:39], s[12:13], 8
+; SI-NEXT: s_lshr_b64 s[22:23], s[14:15], 24
+; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16
+; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 8
+; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24
+; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16
+; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 8
+; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 24
+; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16
+; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 8
; SI-NEXT: s_cbranch_execnz .LBB85_4
; SI-NEXT: .LBB85_2: ; %cmp.true
-; SI-NEXT: v_add_f64 v[28:29], s[18:19], 1.0
-; SI-NEXT: v_add_f64 v[5:6], s[26:27], 1.0
-; SI-NEXT: v_add_f64 v[13:14], s[22:23], 1.0
-; SI-NEXT: v_add_f64 v[3:4], s[28:29], 1.0
+; SI-NEXT: v_add_f64 v[28:29], s[16:17], 1.0
+; SI-NEXT: v_add_f64 v[5:6], s[8:9], 1.0
+; SI-NEXT: v_add_f64 v[13:14], s[12:13], 1.0
+; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0
; SI-NEXT: v_lshr_b64 v[48:49], v[28:29], 24
; SI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0
-; SI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0
-; SI-NEXT: v_add_f64 v[20:21], s[20:21], 1.0
-; SI-NEXT: v_add_f64 v[32:33], s[16:17], 1.0
+; SI-NEXT: v_add_f64 v[7:8], s[10:11], 1.0
+; SI-NEXT: v_add_f64 v[20:21], s[14:15], 1.0
+; SI-NEXT: v_add_f64 v[32:33], s[18:19], 1.0
; SI-NEXT: v_lshr_b64 v[22:23], v[5:6], 16
; SI-NEXT: v_lshr_b64 v[34:35], v[13:14], 16
; SI-NEXT: v_lshr_b64 v[49:50], v[28:29], 16
@@ -52512,13 +53100,13 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_lshr_b64 v[25:26], v[7:8], 16
; SI-NEXT: v_lshr_b64 v[37:38], v[20:21], 16
; SI-NEXT: v_lshr_b64 v[52:53], v[32:33], 16
-; SI-NEXT: v_readfirstlane_b32 s17, v33
-; SI-NEXT: v_readfirstlane_b32 s19, v29
-; SI-NEXT: v_readfirstlane_b32 s21, v21
-; SI-NEXT: v_readfirstlane_b32 s23, v14
-; SI-NEXT: v_readfirstlane_b32 s25, v8
-; SI-NEXT: v_readfirstlane_b32 s27, v6
-; SI-NEXT: v_readfirstlane_b32 s29, v4
+; SI-NEXT: v_readfirstlane_b32 s19, v33
+; SI-NEXT: v_readfirstlane_b32 s17, v29
+; SI-NEXT: v_readfirstlane_b32 s15, v21
+; SI-NEXT: v_readfirstlane_b32 s13, v14
+; SI-NEXT: v_readfirstlane_b32 s11, v8
+; SI-NEXT: v_readfirstlane_b32 s9, v6
+; SI-NEXT: v_readfirstlane_b32 s7, v4
; SI-NEXT: v_readfirstlane_b32 s5, v2
; SI-NEXT: v_lshr_b64 v[11:12], v[1:2], 8
; SI-NEXT: v_lshr_b64 v[18:19], v[5:6], 24
@@ -52529,27 +53117,27 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: s_lshr_b32 s48, s5, 24
; SI-NEXT: s_lshr_b32 s49, s5, 16
; SI-NEXT: s_lshr_b32 s50, s5, 8
-; SI-NEXT: s_lshr_b32 s51, s29, 24
-; SI-NEXT: s_lshr_b32 s52, s29, 16
-; SI-NEXT: s_lshr_b32 s53, s29, 8
-; SI-NEXT: s_lshr_b32 s54, s27, 24
-; SI-NEXT: s_lshr_b32 s55, s27, 16
-; SI-NEXT: s_lshr_b32 s64, s27, 8
-; SI-NEXT: s_lshr_b32 s65, s25, 24
-; SI-NEXT: s_lshr_b32 s66, s25, 16
-; SI-NEXT: s_lshr_b32 s67, s25, 8
-; SI-NEXT: s_lshr_b32 s68, s23, 24
-; SI-NEXT: s_lshr_b32 s69, s23, 16
-; SI-NEXT: s_lshr_b32 s70, s23, 8
-; SI-NEXT: s_lshr_b32 s71, s21, 24
-; SI-NEXT: s_lshr_b32 s80, s21, 16
-; SI-NEXT: s_lshr_b32 s81, s21, 8
-; SI-NEXT: s_lshr_b32 s82, s19, 24
-; SI-NEXT: s_lshr_b32 s83, s19, 16
-; SI-NEXT: s_lshr_b32 s84, s19, 8
-; SI-NEXT: s_lshr_b32 s85, s17, 24
-; SI-NEXT: s_lshr_b32 s86, s17, 16
-; SI-NEXT: s_lshr_b32 s87, s17, 8
+; SI-NEXT: s_lshr_b32 s51, s7, 24
+; SI-NEXT: s_lshr_b32 s52, s7, 16
+; SI-NEXT: s_lshr_b32 s53, s7, 8
+; SI-NEXT: s_lshr_b32 s54, s9, 24
+; SI-NEXT: s_lshr_b32 s55, s9, 16
+; SI-NEXT: s_lshr_b32 s64, s9, 8
+; SI-NEXT: s_lshr_b32 s65, s11, 24
+; SI-NEXT: s_lshr_b32 s66, s11, 16
+; SI-NEXT: s_lshr_b32 s67, s11, 8
+; SI-NEXT: s_lshr_b32 s68, s13, 24
+; SI-NEXT: s_lshr_b32 s69, s13, 16
+; SI-NEXT: s_lshr_b32 s70, s13, 8
+; SI-NEXT: s_lshr_b32 s71, s15, 24
+; SI-NEXT: s_lshr_b32 s80, s15, 16
+; SI-NEXT: s_lshr_b32 s81, s15, 8
+; SI-NEXT: s_lshr_b32 s82, s17, 24
+; SI-NEXT: s_lshr_b32 s83, s17, 16
+; SI-NEXT: s_lshr_b32 s84, s17, 8
+; SI-NEXT: s_lshr_b32 s85, s19, 24
+; SI-NEXT: s_lshr_b32 s86, s19, 16
+; SI-NEXT: s_lshr_b32 s87, s19, 8
; SI-NEXT: s_branch .LBB85_5
; SI-NEXT: .LBB85_3:
; SI-NEXT: ; implicit-def: $sgpr74
@@ -52565,8 +53153,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: ; implicit-def: $sgpr83
; SI-NEXT: ; implicit-def: $sgpr82
; SI-NEXT: ; implicit-def: $sgpr40
-; SI-NEXT: ; implicit-def: $sgpr12
-; SI-NEXT: ; implicit-def: $sgpr8
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr22
; SI-NEXT: ; implicit-def: $sgpr81
; SI-NEXT: ; implicit-def: $sgpr80
; SI-NEXT: ; implicit-def: $sgpr71
@@ -52597,18 +53185,18 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: ; implicit-def: $sgpr72
; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; implicit-def: $sgpr46
-; SI-NEXT: ; implicit-def: $sgpr14
-; SI-NEXT: ; implicit-def: $sgpr10
-; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr28
+; SI-NEXT: ; implicit-def: $sgpr24
+; SI-NEXT: ; implicit-def: $sgpr20
; SI-NEXT: s_branch .LBB85_2
; SI-NEXT: .LBB85_4:
-; SI-NEXT: v_mov_b32_e32 v32, s16
-; SI-NEXT: v_mov_b32_e32 v28, s18
-; SI-NEXT: v_mov_b32_e32 v20, s20
-; SI-NEXT: v_mov_b32_e32 v13, s22
-; SI-NEXT: v_mov_b32_e32 v7, s24
-; SI-NEXT: v_mov_b32_e32 v5, s26
-; SI-NEXT: v_mov_b32_e32 v3, s28
+; SI-NEXT: v_mov_b32_e32 v32, s18
+; SI-NEXT: v_mov_b32_e32 v28, s16
+; SI-NEXT: v_mov_b32_e32 v20, s14
+; SI-NEXT: v_mov_b32_e32 v13, s12
+; SI-NEXT: v_mov_b32_e32 v7, s10
+; SI-NEXT: v_mov_b32_e32 v5, s8
+; SI-NEXT: v_mov_b32_e32 v3, s6
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: v_mov_b32_e32 v53, s74
; SI-NEXT: v_mov_b32_e32 v52, s62
@@ -52617,8 +53205,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_mov_b32_e32 v49, s44
; SI-NEXT: v_mov_b32_e32 v48, s42
; SI-NEXT: v_mov_b32_e32 v38, s40
-; SI-NEXT: v_mov_b32_e32 v37, s12
-; SI-NEXT: v_mov_b32_e32 v36, s8
+; SI-NEXT: v_mov_b32_e32 v37, s26
+; SI-NEXT: v_mov_b32_e32 v36, s22
; SI-NEXT: v_mov_b32_e32 v35, s38
; SI-NEXT: v_mov_b32_e32 v34, s36
; SI-NEXT: v_mov_b32_e32 v30, s34
@@ -52631,13 +53219,13 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_mov_b32_e32 v17, s72
; SI-NEXT: v_mov_b32_e32 v16, s60
; SI-NEXT: v_mov_b32_e32 v15, s46
-; SI-NEXT: v_mov_b32_e32 v11, s14
-; SI-NEXT: v_mov_b32_e32 v10, s10
-; SI-NEXT: v_mov_b32_e32 v9, s6
+; SI-NEXT: v_mov_b32_e32 v11, s28
+; SI-NEXT: v_mov_b32_e32 v10, s24
+; SI-NEXT: v_mov_b32_e32 v9, s20
; SI-NEXT: .LBB85_5: ; %end
; SI-NEXT: v_and_b32_e32 v2, 0xff, v32
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v53
-; SI-NEXT: s_and_b32 s4, s17, 0xff
+; SI-NEXT: s_and_b32 s4, s19, 0xff
; SI-NEXT: s_lshl_b32 s6, s87, 8
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_and_b32_e32 v4, 0xff, v52
@@ -52646,11 +53234,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v51
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: s_lshl_b32 s7, s85, 24
+; SI-NEXT: s_lshl_b32 s8, s85, 24
; SI-NEXT: v_or_b32_e32 v4, v6, v4
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: s_or_b32 s6, s7, s6
+; SI-NEXT: s_or_b32 s6, s8, s6
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: s_or_b32 s4, s4, s6
; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
@@ -52661,7 +53249,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_and_b32_e32 v2, 0xff, v28
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v50
-; SI-NEXT: s_and_b32 s4, s19, 0xff
+; SI-NEXT: s_and_b32 s4, s17, 0xff
; SI-NEXT: s_lshl_b32 s6, s84, 8
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_and_b32_e32 v4, 0xff, v49
@@ -52670,11 +53258,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v48
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: s_lshl_b32 s7, s82, 24
+; SI-NEXT: s_lshl_b32 s8, s82, 24
; SI-NEXT: v_or_b32_e32 v4, v6, v4
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: s_or_b32 s6, s7, s6
+; SI-NEXT: s_or_b32 s6, s8, s6
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0
; SI-NEXT: s_or_b32 s4, s4, s6
@@ -52686,7 +53274,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_and_b32_e32 v2, 0xff, v20
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v38
-; SI-NEXT: s_and_b32 s4, s21, 0xff
+; SI-NEXT: s_and_b32 s4, s15, 0xff
; SI-NEXT: s_lshl_b32 s6, s81, 8
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_and_b32_e32 v4, 0xff, v37
@@ -52695,11 +53283,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v36
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: s_lshl_b32 s7, s71, 24
+; SI-NEXT: s_lshl_b32 s8, s71, 24
; SI-NEXT: v_or_b32_e32 v4, v6, v4
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: s_or_b32 s6, s7, s6
+; SI-NEXT: s_or_b32 s6, s8, s6
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0
; SI-NEXT: s_or_b32 s4, s4, s6
@@ -52711,7 +53299,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_and_b32_e32 v2, 0xff, v13
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v35
-; SI-NEXT: s_and_b32 s4, s23, 0xff
+; SI-NEXT: s_and_b32 s4, s13, 0xff
; SI-NEXT: s_lshl_b32 s6, s70, 8
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_and_b32_e32 v4, 0xff, v34
@@ -52720,11 +53308,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v30
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: s_lshl_b32 s7, s68, 24
+; SI-NEXT: s_lshl_b32 s8, s68, 24
; SI-NEXT: v_or_b32_e32 v4, v6, v4
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: s_or_b32 s6, s7, s6
+; SI-NEXT: s_or_b32 s6, s8, s6
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0
; SI-NEXT: s_or_b32 s4, s4, s6
@@ -52736,7 +53324,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_and_b32_e32 v2, 0xff, v7
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v26
-; SI-NEXT: s_and_b32 s4, s25, 0xff
+; SI-NEXT: s_and_b32 s4, s11, 0xff
; SI-NEXT: s_lshl_b32 s6, s67, 8
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_and_b32_e32 v4, 0xff, v25
@@ -52745,11 +53333,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24
; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: s_lshl_b32 s7, s65, 24
+; SI-NEXT: s_lshl_b32 s8, s65, 24
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v4, v6, v4
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: s_or_b32 s6, s7, s6
+; SI-NEXT: s_or_b32 s6, s8, s6
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; SI-NEXT: s_or_b32 s4, s4, s6
@@ -52761,7 +53349,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_and_b32_e32 v2, 0xff, v5
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v23
-; SI-NEXT: s_and_b32 s4, s27, 0xff
+; SI-NEXT: s_and_b32 s4, s9, 0xff
; SI-NEXT: s_lshl_b32 s6, s64, 8
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_and_b32_e32 v4, 0xff, v22
@@ -52770,11 +53358,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v18
; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: s_lshl_b32 s7, s54, 24
+; SI-NEXT: s_lshl_b32 s8, s54, 24
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v4, v5, v4
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: s_or_b32 s6, s7, s6
+; SI-NEXT: s_or_b32 s6, s8, s6
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0
; SI-NEXT: s_or_b32 s4, s4, s6
@@ -52785,7 +53373,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; SI-NEXT: v_and_b32_e32 v2, 0xff, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17
-; SI-NEXT: s_and_b32 s4, s29, 0xff
+; SI-NEXT: s_and_b32 s4, s7, 0xff
; SI-NEXT: s_lshl_b32 s6, s53, 8
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xff, v16
@@ -52895,10 +53483,38 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; VI-NEXT: v_writelane_b32 v40, s55, 15
; VI-NEXT: v_writelane_b32 v40, s64, 16
; VI-NEXT: v_writelane_b32 v40, s65, 17
+; VI-NEXT: v_mov_b32_e32 v4, s16
+; VI-NEXT: v_mov_b32_e32 v5, s17
+; VI-NEXT: v_mov_b32_e32 v6, s18
+; VI-NEXT: v_mov_b32_e32 v7, s19
+; VI-NEXT: v_mov_b32_e32 v8, s20
+; VI-NEXT: v_mov_b32_e32 v9, s21
+; VI-NEXT: v_mov_b32_e32 v10, s22
+; VI-NEXT: v_mov_b32_e32 v11, s23
+; VI-NEXT: v_mov_b32_e32 v12, s24
+; VI-NEXT: v_mov_b32_e32 v13, s25
+; VI-NEXT: v_mov_b32_e32 v14, s26
+; VI-NEXT: v_mov_b32_e32 v15, s27
+; VI-NEXT: v_mov_b32_e32 v16, s28
+; VI-NEXT: v_mov_b32_e32 v17, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; VI-NEXT: v_writelane_b32 v40, s66, 18
+; VI-NEXT: v_readfirstlane_b32 s18, v4
+; VI-NEXT: v_readfirstlane_b32 s19, v5
+; VI-NEXT: v_readfirstlane_b32 s16, v6
+; VI-NEXT: v_readfirstlane_b32 s17, v7
+; VI-NEXT: v_readfirstlane_b32 s14, v8
+; VI-NEXT: v_readfirstlane_b32 s15, v9
+; VI-NEXT: v_readfirstlane_b32 s12, v10
+; VI-NEXT: v_readfirstlane_b32 s13, v11
+; VI-NEXT: v_readfirstlane_b32 s10, v12
+; VI-NEXT: v_readfirstlane_b32 s11, v13
+; VI-NEXT: v_readfirstlane_b32 s8, v14
+; VI-NEXT: v_readfirstlane_b32 s9, v15
+; VI-NEXT: v_readfirstlane_b32 s6, v16
+; VI-NEXT: v_readfirstlane_b32 s7, v17
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_and_b64 s[20:21], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s5, v2
; VI-NEXT: v_writelane_b32 v40, s67, 19
; VI-NEXT: s_cbranch_scc0 .LBB85_3
@@ -52908,59 +53524,59 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; VI-NEXT: s_lshr_b32 s58, s5, 8
; VI-NEXT: s_lshr_b32 s37, s4, 16
; VI-NEXT: s_lshr_b32 s36, s4, 8
-; VI-NEXT: s_lshr_b32 s59, s29, 24
-; VI-NEXT: s_lshr_b32 s60, s29, 16
-; VI-NEXT: s_lshr_b32 s61, s29, 8
-; VI-NEXT: s_lshr_b32 s39, s28, 16
-; VI-NEXT: s_lshr_b32 s38, s28, 8
-; VI-NEXT: s_lshr_b32 s62, s27, 24
-; VI-NEXT: s_lshr_b32 s63, s27, 16
-; VI-NEXT: s_lshr_b32 s72, s27, 8
-; VI-NEXT: s_lshr_b32 s49, s26, 16
-; VI-NEXT: s_lshr_b32 s48, s26, 8
-; VI-NEXT: s_lshr_b32 s73, s25, 24
-; VI-NEXT: s_lshr_b32 s74, s25, 16
-; VI-NEXT: s_lshr_b32 s75, s25, 8
-; VI-NEXT: s_lshr_b32 s51, s24, 16
-; VI-NEXT: s_lshr_b32 s50, s24, 8
-; VI-NEXT: s_lshr_b32 s76, s23, 24
-; VI-NEXT: s_lshr_b32 s77, s23, 16
-; VI-NEXT: s_lshr_b32 s78, s23, 8
-; VI-NEXT: s_lshr_b32 s53, s22, 16
-; VI-NEXT: s_lshr_b32 s52, s22, 8
-; VI-NEXT: s_lshr_b32 s79, s21, 24
-; VI-NEXT: s_lshr_b32 s88, s21, 16
-; VI-NEXT: s_lshr_b32 s89, s21, 8
-; VI-NEXT: s_lshr_b32 s55, s20, 16
-; VI-NEXT: s_lshr_b32 s54, s20, 8
-; VI-NEXT: s_lshr_b32 s90, s19, 24
-; VI-NEXT: s_lshr_b32 s91, s19, 16
-; VI-NEXT: s_lshr_b32 s30, s19, 8
-; VI-NEXT: s_lshr_b32 s65, s18, 16
-; VI-NEXT: s_lshr_b32 s64, s18, 8
-; VI-NEXT: s_lshr_b32 s31, s17, 24
-; VI-NEXT: s_lshr_b32 s34, s17, 16
-; VI-NEXT: s_lshr_b32 s35, s17, 8
-; VI-NEXT: s_lshr_b32 s67, s16, 16
-; VI-NEXT: s_lshr_b32 s66, s16, 8
+; VI-NEXT: s_lshr_b32 s59, s7, 24
+; VI-NEXT: s_lshr_b32 s60, s7, 16
+; VI-NEXT: s_lshr_b32 s61, s7, 8
+; VI-NEXT: s_lshr_b32 s39, s6, 16
+; VI-NEXT: s_lshr_b32 s38, s6, 8
+; VI-NEXT: s_lshr_b32 s62, s9, 24
+; VI-NEXT: s_lshr_b32 s63, s9, 16
+; VI-NEXT: s_lshr_b32 s72, s9, 8
+; VI-NEXT: s_lshr_b32 s49, s8, 16
+; VI-NEXT: s_lshr_b32 s48, s8, 8
+; VI-NEXT: s_lshr_b32 s73, s11, 24
+; VI-NEXT: s_lshr_b32 s74, s11, 16
+; VI-NEXT: s_lshr_b32 s75, s11, 8
+; VI-NEXT: s_lshr_b32 s51, s10, 16
+; VI-NEXT: s_lshr_b32 s50, s10, 8
+; VI-NEXT: s_lshr_b32 s76, s13, 24
+; VI-NEXT: s_lshr_b32 s77, s13, 16
+; VI-NEXT: s_lshr_b32 s78, s13, 8
+; VI-NEXT: s_lshr_b32 s53, s12, 16
+; VI-NEXT: s_lshr_b32 s52, s12, 8
+; VI-NEXT: s_lshr_b32 s79, s15, 24
+; VI-NEXT: s_lshr_b32 s88, s15, 16
+; VI-NEXT: s_lshr_b32 s89, s15, 8
+; VI-NEXT: s_lshr_b32 s55, s14, 16
+; VI-NEXT: s_lshr_b32 s54, s14, 8
+; VI-NEXT: s_lshr_b32 s90, s17, 24
+; VI-NEXT: s_lshr_b32 s91, s17, 16
+; VI-NEXT: s_lshr_b32 s30, s17, 8
+; VI-NEXT: s_lshr_b32 s65, s16, 16
+; VI-NEXT: s_lshr_b32 s64, s16, 8
+; VI-NEXT: s_lshr_b32 s31, s19, 24
+; VI-NEXT: s_lshr_b32 s34, s19, 16
+; VI-NEXT: s_lshr_b32 s35, s19, 8
+; VI-NEXT: s_lshr_b32 s67, s18, 16
+; VI-NEXT: s_lshr_b32 s66, s18, 8
; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24
-; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24
-; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24
-; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24
-; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24
-; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24
-; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24
-; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[42:43], s[6:7], 24
+; VI-NEXT: s_lshr_b64 s[40:41], s[8:9], 24
+; VI-NEXT: s_lshr_b64 s[28:29], s[10:11], 24
+; VI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24
+; VI-NEXT: s_lshr_b64 s[24:25], s[14:15], 24
+; VI-NEXT: s_lshr_b64 s[22:23], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24
; VI-NEXT: s_cbranch_execnz .LBB85_4
; VI-NEXT: .LBB85_2: ; %cmp.true
; VI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0
-; VI-NEXT: v_add_f64 v[3:4], s[28:29], 1.0
-; VI-NEXT: v_add_f64 v[5:6], s[26:27], 1.0
-; VI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0
-; VI-NEXT: v_add_f64 v[11:12], s[22:23], 1.0
-; VI-NEXT: v_add_f64 v[15:16], s[20:21], 1.0
-; VI-NEXT: v_add_f64 v[9:10], s[18:19], 1.0
-; VI-NEXT: v_add_f64 v[13:14], s[16:17], 1.0
+; VI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0
+; VI-NEXT: v_add_f64 v[5:6], s[8:9], 1.0
+; VI-NEXT: v_add_f64 v[7:8], s[10:11], 1.0
+; VI-NEXT: v_add_f64 v[11:12], s[12:13], 1.0
+; VI-NEXT: v_add_f64 v[15:16], s[14:15], 1.0
+; VI-NEXT: v_add_f64 v[9:10], s[16:17], 1.0
+; VI-NEXT: v_add_f64 v[13:14], s[18:19], 1.0
; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2]
; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4]
; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6]
@@ -52968,13 +53584,13 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[11:12]
; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[15:16]
; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10]
-; VI-NEXT: v_readfirstlane_b32 s17, v14
-; VI-NEXT: v_readfirstlane_b32 s19, v10
-; VI-NEXT: v_readfirstlane_b32 s21, v16
-; VI-NEXT: v_readfirstlane_b32 s23, v12
-; VI-NEXT: v_readfirstlane_b32 s25, v8
-; VI-NEXT: v_readfirstlane_b32 s27, v6
-; VI-NEXT: v_readfirstlane_b32 s29, v4
+; VI-NEXT: v_readfirstlane_b32 s19, v14
+; VI-NEXT: v_readfirstlane_b32 s17, v10
+; VI-NEXT: v_readfirstlane_b32 s15, v16
+; VI-NEXT: v_readfirstlane_b32 s13, v12
+; VI-NEXT: v_readfirstlane_b32 s11, v8
+; VI-NEXT: v_readfirstlane_b32 s9, v6
+; VI-NEXT: v_readfirstlane_b32 s7, v4
; VI-NEXT: v_readfirstlane_b32 s5, v2
; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14]
; VI-NEXT: s_lshr_b32 s56, s5, 24
@@ -52982,70 +53598,70 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; VI-NEXT: s_lshr_b32 s58, s5, 8
; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v1
-; VI-NEXT: s_lshr_b32 s59, s29, 24
-; VI-NEXT: s_lshr_b32 s60, s29, 16
-; VI-NEXT: s_lshr_b32 s61, s29, 8
+; VI-NEXT: s_lshr_b32 s59, s7, 24
+; VI-NEXT: s_lshr_b32 s60, s7, 16
+; VI-NEXT: s_lshr_b32 s61, s7, 8
; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3
; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v3
-; VI-NEXT: s_lshr_b32 s62, s27, 24
-; VI-NEXT: s_lshr_b32 s63, s27, 16
-; VI-NEXT: s_lshr_b32 s72, s27, 8
+; VI-NEXT: s_lshr_b32 s62, s9, 24
+; VI-NEXT: s_lshr_b32 s63, s9, 16
+; VI-NEXT: s_lshr_b32 s72, s9, 8
; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5
-; VI-NEXT: s_lshr_b32 s73, s25, 24
-; VI-NEXT: s_lshr_b32 s74, s25, 16
-; VI-NEXT: s_lshr_b32 s75, s25, 8
+; VI-NEXT: s_lshr_b32 s73, s11, 24
+; VI-NEXT: s_lshr_b32 s74, s11, 16
+; VI-NEXT: s_lshr_b32 s75, s11, 8
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v7
-; VI-NEXT: s_lshr_b32 s76, s23, 24
-; VI-NEXT: s_lshr_b32 s77, s23, 16
-; VI-NEXT: s_lshr_b32 s78, s23, 8
+; VI-NEXT: s_lshr_b32 s76, s13, 24
+; VI-NEXT: s_lshr_b32 s77, s13, 16
+; VI-NEXT: s_lshr_b32 s78, s13, 8
; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11
; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v11
-; VI-NEXT: s_lshr_b32 s79, s21, 24
-; VI-NEXT: s_lshr_b32 s88, s21, 16
-; VI-NEXT: s_lshr_b32 s89, s21, 8
+; VI-NEXT: s_lshr_b32 s79, s15, 24
+; VI-NEXT: s_lshr_b32 s88, s15, 16
+; VI-NEXT: s_lshr_b32 s89, s15, 8
; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v15
; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v15
-; VI-NEXT: s_lshr_b32 s90, s19, 24
-; VI-NEXT: s_lshr_b32 s91, s19, 16
-; VI-NEXT: s_lshr_b32 s30, s19, 8
+; VI-NEXT: s_lshr_b32 s90, s17, 24
+; VI-NEXT: s_lshr_b32 s91, s17, 16
+; VI-NEXT: s_lshr_b32 s30, s17, 8
; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9
; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v9
-; VI-NEXT: s_lshr_b32 s31, s17, 24
-; VI-NEXT: s_lshr_b32 s34, s17, 16
-; VI-NEXT: s_lshr_b32 s35, s17, 8
+; VI-NEXT: s_lshr_b32 s31, s19, 24
+; VI-NEXT: s_lshr_b32 s34, s19, 16
+; VI-NEXT: s_lshr_b32 s35, s19, 8
; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v13
; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v13
; VI-NEXT: s_branch .LBB85_5
; VI-NEXT: .LBB85_3:
; VI-NEXT: ; implicit-def: $sgpr66
; VI-NEXT: ; implicit-def: $sgpr67
-; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr20
; VI-NEXT: ; implicit-def: $sgpr35
; VI-NEXT: ; implicit-def: $sgpr34
; VI-NEXT: ; implicit-def: $sgpr31
; VI-NEXT: ; implicit-def: $sgpr64
; VI-NEXT: ; implicit-def: $sgpr65
-; VI-NEXT: ; implicit-def: $sgpr8
+; VI-NEXT: ; implicit-def: $sgpr22
; VI-NEXT: ; implicit-def: $sgpr30
; VI-NEXT: ; implicit-def: $sgpr91
; VI-NEXT: ; implicit-def: $sgpr90
; VI-NEXT: ; implicit-def: $sgpr54
; VI-NEXT: ; implicit-def: $sgpr55
-; VI-NEXT: ; implicit-def: $sgpr10
+; VI-NEXT: ; implicit-def: $sgpr24
; VI-NEXT: ; implicit-def: $sgpr89
; VI-NEXT: ; implicit-def: $sgpr88
; VI-NEXT: ; implicit-def: $sgpr79
; VI-NEXT: ; implicit-def: $sgpr52
; VI-NEXT: ; implicit-def: $sgpr53
-; VI-NEXT: ; implicit-def: $sgpr12
+; VI-NEXT: ; implicit-def: $sgpr26
; VI-NEXT: ; implicit-def: $sgpr78
; VI-NEXT: ; implicit-def: $sgpr77
; VI-NEXT: ; implicit-def: $sgpr76
; VI-NEXT: ; implicit-def: $sgpr50
; VI-NEXT: ; implicit-def: $sgpr51
-; VI-NEXT: ; implicit-def: $sgpr14
+; VI-NEXT: ; implicit-def: $sgpr28
; VI-NEXT: ; implicit-def: $sgpr75
; VI-NEXT: ; implicit-def: $sgpr74
; VI-NEXT: ; implicit-def: $sgpr73
@@ -53069,8 +53685,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; VI-NEXT: ; implicit-def: $sgpr56
; VI-NEXT: s_branch .LBB85_2
; VI-NEXT: .LBB85_4:
-; VI-NEXT: v_mov_b32_e32 v13, s16
-; VI-NEXT: v_mov_b32_e32 v9, s18
+; VI-NEXT: v_mov_b32_e32 v13, s18
+; VI-NEXT: v_mov_b32_e32 v9, s16
; VI-NEXT: v_mov_b32_e32 v48, s67
; VI-NEXT: v_mov_b32_e32 v49, s66
; VI-NEXT: v_mov_b32_e32 v38, s65
@@ -53087,27 +53703,27 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; VI-NEXT: v_mov_b32_e32 v29, s38
; VI-NEXT: v_mov_b32_e32 v26, s37
; VI-NEXT: v_mov_b32_e32 v27, s36
-; VI-NEXT: v_mov_b32_e32 v15, s20
-; VI-NEXT: v_mov_b32_e32 v11, s22
-; VI-NEXT: v_mov_b32_e32 v7, s24
-; VI-NEXT: v_mov_b32_e32 v5, s26
-; VI-NEXT: v_mov_b32_e32 v3, s28
+; VI-NEXT: v_mov_b32_e32 v15, s14
+; VI-NEXT: v_mov_b32_e32 v11, s12
+; VI-NEXT: v_mov_b32_e32 v7, s10
+; VI-NEXT: v_mov_b32_e32 v5, s8
+; VI-NEXT: v_mov_b32_e32 v3, s6
; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mov_b32_e32 v24, s6
-; VI-NEXT: v_mov_b32_e32 v23, s8
-; VI-NEXT: v_mov_b32_e32 v22, s10
-; VI-NEXT: v_mov_b32_e32 v21, s12
-; VI-NEXT: v_mov_b32_e32 v20, s14
+; VI-NEXT: v_mov_b32_e32 v24, s20
+; VI-NEXT: v_mov_b32_e32 v23, s22
+; VI-NEXT: v_mov_b32_e32 v22, s24
+; VI-NEXT: v_mov_b32_e32 v21, s26
+; VI-NEXT: v_mov_b32_e32 v20, s28
; VI-NEXT: v_mov_b32_e32 v19, s40
; VI-NEXT: v_mov_b32_e32 v18, s42
; VI-NEXT: v_mov_b32_e32 v17, s44
; VI-NEXT: .LBB85_5: ; %end
-; VI-NEXT: s_and_b32 s4, s17, 0xff
+; VI-NEXT: s_and_b32 s4, s19, 0xff
; VI-NEXT: s_lshl_b32 s6, s35, 8
; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: s_and_b32 s6, s34, 0xff
-; VI-NEXT: s_lshl_b32 s7, s31, 8
-; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_lshl_b32 s8, s31, 8
+; VI-NEXT: s_or_b32 s6, s6, s8
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v24
; VI-NEXT: s_and_b32 s4, s4, 0xffff
@@ -53117,17 +53733,17 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: s_and_b32 s4, s19, 0xff
+; VI-NEXT: s_and_b32 s4, s17, 0xff
; VI-NEXT: s_lshl_b32 s6, s30, 8
; VI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: s_and_b32 s6, s91, 0xff
-; VI-NEXT: s_lshl_b32 s7, s90, 8
+; VI-NEXT: s_lshl_b32 s8, s90, 8
; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v39
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v23
-; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_or_b32 s6, s6, s8
; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v38, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_and_b32 s4, s4, 0xffff
@@ -53137,16 +53753,16 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: s_and_b32 s4, s21, 0xff
+; VI-NEXT: s_and_b32 s4, s15, 0xff
; VI-NEXT: s_lshl_b32 s6, s89, 8
; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: s_and_b32 s6, s88, 0xff
-; VI-NEXT: s_lshl_b32 s7, s79, 8
+; VI-NEXT: s_lshl_b32 s8, s79, 8
; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v37
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v22
-; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_or_b32 s6, s6, s8
; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_and_b32 s4, s4, 0xffff
@@ -53156,16 +53772,16 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: s_and_b32 s4, s23, 0xff
+; VI-NEXT: s_and_b32 s4, s13, 0xff
; VI-NEXT: s_lshl_b32 s6, s78, 8
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: s_and_b32 s6, s77, 0xff
-; VI-NEXT: s_lshl_b32 s7, s76, 8
+; VI-NEXT: s_lshl_b32 s8, s76, 8
; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v35
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v21
-; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_or_b32 s6, s6, s8
; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_and_b32 s4, s4, 0xffff
@@ -53175,16 +53791,16 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: s_and_b32 s4, s25, 0xff
+; VI-NEXT: s_and_b32 s4, s11, 0xff
; VI-NEXT: s_lshl_b32 s6, s75, 8
; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: s_and_b32 s6, s74, 0xff
-; VI-NEXT: s_lshl_b32 s7, s73, 8
+; VI-NEXT: s_lshl_b32 s8, s73, 8
; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v33
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v20
-; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_or_b32 s6, s6, s8
; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v32, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_and_b32 s4, s4, 0xffff
@@ -53194,16 +53810,16 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: s_and_b32 s4, s27, 0xff
+; VI-NEXT: s_and_b32 s4, s9, 0xff
; VI-NEXT: s_lshl_b32 s6, s72, 8
; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: s_and_b32 s6, s63, 0xff
-; VI-NEXT: s_lshl_b32 s7, s62, 8
+; VI-NEXT: s_lshl_b32 s8, s62, 8
; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v19
-; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_or_b32 s6, s6, s8
; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_and_b32 s4, s4, 0xffff
@@ -53214,7 +53830,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: s_and_b32 s4, s29, 0xff
+; VI-NEXT: s_and_b32 s4, s7, 0xff
; VI-NEXT: s_lshl_b32 s6, s61, 8
; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v29
@@ -53299,10 +53915,38 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX9-NEXT: v_writelane_b32 v40, s51, 11
; GFX9-NEXT: v_writelane_b32 v40, s52, 12
; GFX9-NEXT: v_writelane_b32 v40, s53, 13
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: v_mov_b32_e32 v5, s17
+; GFX9-NEXT: v_mov_b32_e32 v6, s18
+; GFX9-NEXT: v_mov_b32_e32 v7, s19
+; GFX9-NEXT: v_mov_b32_e32 v8, s20
+; GFX9-NEXT: v_mov_b32_e32 v9, s21
+; GFX9-NEXT: v_mov_b32_e32 v10, s22
+; GFX9-NEXT: v_mov_b32_e32 v11, s23
+; GFX9-NEXT: v_mov_b32_e32 v12, s24
+; GFX9-NEXT: v_mov_b32_e32 v13, s25
+; GFX9-NEXT: v_mov_b32_e32 v14, s26
+; GFX9-NEXT: v_mov_b32_e32 v15, s27
+; GFX9-NEXT: v_mov_b32_e32 v16, s28
+; GFX9-NEXT: v_mov_b32_e32 v17, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; GFX9-NEXT: v_writelane_b32 v40, s54, 14
+; GFX9-NEXT: v_readfirstlane_b32 s18, v4
+; GFX9-NEXT: v_readfirstlane_b32 s19, v5
+; GFX9-NEXT: v_readfirstlane_b32 s16, v6
+; GFX9-NEXT: v_readfirstlane_b32 s17, v7
+; GFX9-NEXT: v_readfirstlane_b32 s14, v8
+; GFX9-NEXT: v_readfirstlane_b32 s15, v9
+; GFX9-NEXT: v_readfirstlane_b32 s12, v10
+; GFX9-NEXT: v_readfirstlane_b32 s13, v11
+; GFX9-NEXT: v_readfirstlane_b32 s10, v12
+; GFX9-NEXT: v_readfirstlane_b32 s11, v13
+; GFX9-NEXT: v_readfirstlane_b32 s8, v14
+; GFX9-NEXT: v_readfirstlane_b32 s9, v15
+; GFX9-NEXT: v_readfirstlane_b32 s6, v16
+; GFX9-NEXT: v_readfirstlane_b32 s7, v17
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: v_writelane_b32 v40, s55, 15
; GFX9-NEXT: s_cbranch_scc0 .LBB85_3
@@ -53312,59 +53956,59 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX9-NEXT: s_lshr_b32 s58, s5, 8
; GFX9-NEXT: s_lshr_b32 s31, s4, 16
; GFX9-NEXT: s_lshr_b32 s30, s4, 8
-; GFX9-NEXT: s_lshr_b32 s59, s29, 24
-; GFX9-NEXT: s_lshr_b32 s60, s29, 16
-; GFX9-NEXT: s_lshr_b32 s61, s29, 8
-; GFX9-NEXT: s_lshr_b32 s35, s28, 16
-; GFX9-NEXT: s_lshr_b32 s34, s28, 8
-; GFX9-NEXT: s_lshr_b32 s62, s27, 24
-; GFX9-NEXT: s_lshr_b32 s63, s27, 16
-; GFX9-NEXT: s_lshr_b32 s72, s27, 8
-; GFX9-NEXT: s_lshr_b32 s37, s26, 16
-; GFX9-NEXT: s_lshr_b32 s36, s26, 8
-; GFX9-NEXT: s_lshr_b32 s73, s25, 24
-; GFX9-NEXT: s_lshr_b32 s74, s25, 16
-; GFX9-NEXT: s_lshr_b32 s75, s25, 8
-; GFX9-NEXT: s_lshr_b32 s39, s24, 16
-; GFX9-NEXT: s_lshr_b32 s38, s24, 8
-; GFX9-NEXT: s_lshr_b32 s76, s23, 24
-; GFX9-NEXT: s_lshr_b32 s77, s23, 16
-; GFX9-NEXT: s_lshr_b32 s78, s23, 8
-; GFX9-NEXT: s_lshr_b32 s49, s22, 16
-; GFX9-NEXT: s_lshr_b32 s48, s22, 8
-; GFX9-NEXT: s_lshr_b32 s79, s21, 24
-; GFX9-NEXT: s_lshr_b32 s88, s21, 16
-; GFX9-NEXT: s_lshr_b32 s89, s21, 8
-; GFX9-NEXT: s_lshr_b32 s51, s20, 16
-; GFX9-NEXT: s_lshr_b32 s50, s20, 8
-; GFX9-NEXT: s_lshr_b32 s90, s19, 24
-; GFX9-NEXT: s_lshr_b32 s91, s19, 16
-; GFX9-NEXT: s_lshr_b32 s92, s19, 8
-; GFX9-NEXT: s_lshr_b32 s53, s18, 16
-; GFX9-NEXT: s_lshr_b32 s52, s18, 8
-; GFX9-NEXT: s_lshr_b32 s93, s17, 24
-; GFX9-NEXT: s_lshr_b32 s94, s17, 16
-; GFX9-NEXT: s_lshr_b32 s95, s17, 8
-; GFX9-NEXT: s_lshr_b32 s55, s16, 16
-; GFX9-NEXT: s_lshr_b32 s54, s16, 8
+; GFX9-NEXT: s_lshr_b32 s59, s7, 24
+; GFX9-NEXT: s_lshr_b32 s60, s7, 16
+; GFX9-NEXT: s_lshr_b32 s61, s7, 8
+; GFX9-NEXT: s_lshr_b32 s35, s6, 16
+; GFX9-NEXT: s_lshr_b32 s34, s6, 8
+; GFX9-NEXT: s_lshr_b32 s62, s9, 24
+; GFX9-NEXT: s_lshr_b32 s63, s9, 16
+; GFX9-NEXT: s_lshr_b32 s72, s9, 8
+; GFX9-NEXT: s_lshr_b32 s37, s8, 16
+; GFX9-NEXT: s_lshr_b32 s36, s8, 8
+; GFX9-NEXT: s_lshr_b32 s73, s11, 24
+; GFX9-NEXT: s_lshr_b32 s74, s11, 16
+; GFX9-NEXT: s_lshr_b32 s75, s11, 8
+; GFX9-NEXT: s_lshr_b32 s39, s10, 16
+; GFX9-NEXT: s_lshr_b32 s38, s10, 8
+; GFX9-NEXT: s_lshr_b32 s76, s13, 24
+; GFX9-NEXT: s_lshr_b32 s77, s13, 16
+; GFX9-NEXT: s_lshr_b32 s78, s13, 8
+; GFX9-NEXT: s_lshr_b32 s49, s12, 16
+; GFX9-NEXT: s_lshr_b32 s48, s12, 8
+; GFX9-NEXT: s_lshr_b32 s79, s15, 24
+; GFX9-NEXT: s_lshr_b32 s88, s15, 16
+; GFX9-NEXT: s_lshr_b32 s89, s15, 8
+; GFX9-NEXT: s_lshr_b32 s51, s14, 16
+; GFX9-NEXT: s_lshr_b32 s50, s14, 8
+; GFX9-NEXT: s_lshr_b32 s90, s17, 24
+; GFX9-NEXT: s_lshr_b32 s91, s17, 16
+; GFX9-NEXT: s_lshr_b32 s92, s17, 8
+; GFX9-NEXT: s_lshr_b32 s53, s16, 16
+; GFX9-NEXT: s_lshr_b32 s52, s16, 8
+; GFX9-NEXT: s_lshr_b32 s93, s19, 24
+; GFX9-NEXT: s_lshr_b32 s94, s19, 16
+; GFX9-NEXT: s_lshr_b32 s95, s19, 8
+; GFX9-NEXT: s_lshr_b32 s55, s18, 16
+; GFX9-NEXT: s_lshr_b32 s54, s18, 8
; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24
-; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24
-; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24
-; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24
-; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[42:43], s[6:7], 24
+; GFX9-NEXT: s_lshr_b64 s[40:41], s[8:9], 24
+; GFX9-NEXT: s_lshr_b64 s[28:29], s[10:11], 24
+; GFX9-NEXT: s_lshr_b64 s[26:27], s[12:13], 24
+; GFX9-NEXT: s_lshr_b64 s[24:25], s[14:15], 24
+; GFX9-NEXT: s_lshr_b64 s[22:23], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[20:21], s[18:19], 24
; GFX9-NEXT: s_cbranch_execnz .LBB85_4
; GFX9-NEXT: .LBB85_2: ; %cmp.true
; GFX9-NEXT: v_add_f64 v[1:2], s[4:5], 1.0
-; GFX9-NEXT: v_add_f64 v[3:4], s[28:29], 1.0
-; GFX9-NEXT: v_add_f64 v[5:6], s[26:27], 1.0
-; GFX9-NEXT: v_add_f64 v[7:8], s[24:25], 1.0
-; GFX9-NEXT: v_add_f64 v[9:10], s[22:23], 1.0
-; GFX9-NEXT: v_add_f64 v[15:16], s[20:21], 1.0
-; GFX9-NEXT: v_add_f64 v[11:12], s[18:19], 1.0
-; GFX9-NEXT: v_add_f64 v[13:14], s[16:17], 1.0
+; GFX9-NEXT: v_add_f64 v[3:4], s[6:7], 1.0
+; GFX9-NEXT: v_add_f64 v[5:6], s[8:9], 1.0
+; GFX9-NEXT: v_add_f64 v[7:8], s[10:11], 1.0
+; GFX9-NEXT: v_add_f64 v[9:10], s[12:13], 1.0
+; GFX9-NEXT: v_add_f64 v[15:16], s[14:15], 1.0
+; GFX9-NEXT: v_add_f64 v[11:12], s[16:17], 1.0
+; GFX9-NEXT: v_add_f64 v[13:14], s[18:19], 1.0
; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2]
; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4]
; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6]
@@ -53372,13 +54016,13 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10]
; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[15:16]
; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12]
-; GFX9-NEXT: v_readfirstlane_b32 s17, v14
-; GFX9-NEXT: v_readfirstlane_b32 s19, v12
-; GFX9-NEXT: v_readfirstlane_b32 s21, v16
-; GFX9-NEXT: v_readfirstlane_b32 s23, v10
-; GFX9-NEXT: v_readfirstlane_b32 s25, v8
-; GFX9-NEXT: v_readfirstlane_b32 s27, v6
-; GFX9-NEXT: v_readfirstlane_b32 s29, v4
+; GFX9-NEXT: v_readfirstlane_b32 s19, v14
+; GFX9-NEXT: v_readfirstlane_b32 s17, v12
+; GFX9-NEXT: v_readfirstlane_b32 s15, v16
+; GFX9-NEXT: v_readfirstlane_b32 s13, v10
+; GFX9-NEXT: v_readfirstlane_b32 s11, v8
+; GFX9-NEXT: v_readfirstlane_b32 s9, v6
+; GFX9-NEXT: v_readfirstlane_b32 s7, v4
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14]
; GFX9-NEXT: s_lshr_b32 s56, s5, 24
@@ -53386,70 +54030,70 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX9-NEXT: s_lshr_b32 s58, s5, 8
; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v1
-; GFX9-NEXT: s_lshr_b32 s59, s29, 24
-; GFX9-NEXT: s_lshr_b32 s60, s29, 16
-; GFX9-NEXT: s_lshr_b32 s61, s29, 8
+; GFX9-NEXT: s_lshr_b32 s59, s7, 24
+; GFX9-NEXT: s_lshr_b32 s60, s7, 16
+; GFX9-NEXT: s_lshr_b32 s61, s7, 8
; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3
; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v3
-; GFX9-NEXT: s_lshr_b32 s62, s27, 24
-; GFX9-NEXT: s_lshr_b32 s63, s27, 16
-; GFX9-NEXT: s_lshr_b32 s72, s27, 8
+; GFX9-NEXT: s_lshr_b32 s62, s9, 24
+; GFX9-NEXT: s_lshr_b32 s63, s9, 16
+; GFX9-NEXT: s_lshr_b32 s72, s9, 8
; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5
-; GFX9-NEXT: s_lshr_b32 s73, s25, 24
-; GFX9-NEXT: s_lshr_b32 s74, s25, 16
-; GFX9-NEXT: s_lshr_b32 s75, s25, 8
+; GFX9-NEXT: s_lshr_b32 s73, s11, 24
+; GFX9-NEXT: s_lshr_b32 s74, s11, 16
+; GFX9-NEXT: s_lshr_b32 s75, s11, 8
; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7
-; GFX9-NEXT: s_lshr_b32 s76, s23, 24
-; GFX9-NEXT: s_lshr_b32 s77, s23, 16
-; GFX9-NEXT: s_lshr_b32 s78, s23, 8
+; GFX9-NEXT: s_lshr_b32 s76, s13, 24
+; GFX9-NEXT: s_lshr_b32 s77, s13, 16
+; GFX9-NEXT: s_lshr_b32 s78, s13, 8
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v9
-; GFX9-NEXT: s_lshr_b32 s79, s21, 24
-; GFX9-NEXT: s_lshr_b32 s88, s21, 16
-; GFX9-NEXT: s_lshr_b32 s89, s21, 8
+; GFX9-NEXT: s_lshr_b32 s79, s15, 24
+; GFX9-NEXT: s_lshr_b32 s88, s15, 16
+; GFX9-NEXT: s_lshr_b32 s89, s15, 8
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v15
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v15
-; GFX9-NEXT: s_lshr_b32 s90, s19, 24
-; GFX9-NEXT: s_lshr_b32 s91, s19, 16
-; GFX9-NEXT: s_lshr_b32 s92, s19, 8
+; GFX9-NEXT: s_lshr_b32 s90, s17, 24
+; GFX9-NEXT: s_lshr_b32 s91, s17, 16
+; GFX9-NEXT: s_lshr_b32 s92, s17, 8
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v11
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11
-; GFX9-NEXT: s_lshr_b32 s93, s17, 24
-; GFX9-NEXT: s_lshr_b32 s94, s17, 16
-; GFX9-NEXT: s_lshr_b32 s95, s17, 8
+; GFX9-NEXT: s_lshr_b32 s93, s19, 24
+; GFX9-NEXT: s_lshr_b32 s94, s19, 16
+; GFX9-NEXT: s_lshr_b32 s95, s19, 8
; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v13
; GFX9-NEXT: s_branch .LBB85_5
; GFX9-NEXT: .LBB85_3:
; GFX9-NEXT: ; implicit-def: $sgpr54
; GFX9-NEXT: ; implicit-def: $sgpr55
-; GFX9-NEXT: ; implicit-def: $sgpr6
+; GFX9-NEXT: ; implicit-def: $sgpr20
; GFX9-NEXT: ; implicit-def: $sgpr95
; GFX9-NEXT: ; implicit-def: $sgpr94
; GFX9-NEXT: ; implicit-def: $sgpr93
; GFX9-NEXT: ; implicit-def: $sgpr52
; GFX9-NEXT: ; implicit-def: $sgpr53
-; GFX9-NEXT: ; implicit-def: $sgpr8
+; GFX9-NEXT: ; implicit-def: $sgpr22
; GFX9-NEXT: ; implicit-def: $sgpr92
; GFX9-NEXT: ; implicit-def: $sgpr91
; GFX9-NEXT: ; implicit-def: $sgpr90
; GFX9-NEXT: ; implicit-def: $sgpr50
; GFX9-NEXT: ; implicit-def: $sgpr51
-; GFX9-NEXT: ; implicit-def: $sgpr10
+; GFX9-NEXT: ; implicit-def: $sgpr24
; GFX9-NEXT: ; implicit-def: $sgpr89
; GFX9-NEXT: ; implicit-def: $sgpr88
; GFX9-NEXT: ; implicit-def: $sgpr79
; GFX9-NEXT: ; implicit-def: $sgpr48
; GFX9-NEXT: ; implicit-def: $sgpr49
-; GFX9-NEXT: ; implicit-def: $sgpr12
+; GFX9-NEXT: ; implicit-def: $sgpr26
; GFX9-NEXT: ; implicit-def: $sgpr78
; GFX9-NEXT: ; implicit-def: $sgpr77
; GFX9-NEXT: ; implicit-def: $sgpr76
; GFX9-NEXT: ; implicit-def: $sgpr38
; GFX9-NEXT: ; implicit-def: $sgpr39
-; GFX9-NEXT: ; implicit-def: $sgpr14
+; GFX9-NEXT: ; implicit-def: $sgpr28
; GFX9-NEXT: ; implicit-def: $sgpr75
; GFX9-NEXT: ; implicit-def: $sgpr74
; GFX9-NEXT: ; implicit-def: $sgpr73
@@ -53473,8 +54117,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX9-NEXT: ; implicit-def: $sgpr56
; GFX9-NEXT: s_branch .LBB85_2
; GFX9-NEXT: .LBB85_4:
-; GFX9-NEXT: v_mov_b32_e32 v13, s16
-; GFX9-NEXT: v_mov_b32_e32 v11, s18
+; GFX9-NEXT: v_mov_b32_e32 v13, s18
+; GFX9-NEXT: v_mov_b32_e32 v11, s16
; GFX9-NEXT: v_mov_b32_e32 v39, s55
; GFX9-NEXT: v_mov_b32_e32 v49, s54
; GFX9-NEXT: v_mov_b32_e32 v37, s53
@@ -53491,29 +54135,29 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX9-NEXT: v_mov_b32_e32 v29, s34
; GFX9-NEXT: v_mov_b32_e32 v26, s31
; GFX9-NEXT: v_mov_b32_e32 v27, s30
-; GFX9-NEXT: v_mov_b32_e32 v15, s20
-; GFX9-NEXT: v_mov_b32_e32 v9, s22
-; GFX9-NEXT: v_mov_b32_e32 v7, s24
-; GFX9-NEXT: v_mov_b32_e32 v5, s26
-; GFX9-NEXT: v_mov_b32_e32 v3, s28
+; GFX9-NEXT: v_mov_b32_e32 v15, s14
+; GFX9-NEXT: v_mov_b32_e32 v9, s12
+; GFX9-NEXT: v_mov_b32_e32 v7, s10
+; GFX9-NEXT: v_mov_b32_e32 v5, s8
+; GFX9-NEXT: v_mov_b32_e32 v3, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: v_mov_b32_e32 v24, s6
-; GFX9-NEXT: v_mov_b32_e32 v23, s8
-; GFX9-NEXT: v_mov_b32_e32 v22, s10
-; GFX9-NEXT: v_mov_b32_e32 v21, s12
-; GFX9-NEXT: v_mov_b32_e32 v20, s14
+; GFX9-NEXT: v_mov_b32_e32 v24, s20
+; GFX9-NEXT: v_mov_b32_e32 v23, s22
+; GFX9-NEXT: v_mov_b32_e32 v22, s24
+; GFX9-NEXT: v_mov_b32_e32 v21, s26
+; GFX9-NEXT: v_mov_b32_e32 v20, s28
; GFX9-NEXT: v_mov_b32_e32 v19, s40
; GFX9-NEXT: v_mov_b32_e32 v18, s42
; GFX9-NEXT: v_mov_b32_e32 v17, s44
; GFX9-NEXT: .LBB85_5: ; %end
-; GFX9-NEXT: s_and_b32 s4, s17, 0xff
+; GFX9-NEXT: s_and_b32 s4, s19, 0xff
; GFX9-NEXT: s_lshl_b32 s6, s95, 8
; GFX9-NEXT: s_or_b32 s4, s4, s6
; GFX9-NEXT: s_and_b32 s6, s94, 0xff
-; GFX9-NEXT: s_lshl_b32 s7, s93, 8
+; GFX9-NEXT: s_lshl_b32 s8, s93, 8
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v49
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v24
-; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_or_b32 s6, s6, s8
; GFX9-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v8, v39, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
@@ -53522,15 +54166,15 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX9-NEXT: s_or_b32 s4, s4, s6
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_and_b32 s4, s19, 0xff
+; GFX9-NEXT: s_and_b32 s4, s17, 0xff
; GFX9-NEXT: s_lshl_b32 s6, s92, 8
; GFX9-NEXT: s_or_b32 s4, s4, s6
; GFX9-NEXT: s_and_b32 s6, s91, 0xff
-; GFX9-NEXT: s_lshl_b32 s7, s90, 8
+; GFX9-NEXT: s_lshl_b32 s8, s90, 8
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v48
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v23
-; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_or_b32 s6, s6, s8
; GFX9-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
@@ -53539,15 +54183,15 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX9-NEXT: s_or_b32 s4, s4, s6
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8
; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_and_b32 s4, s21, 0xff
+; GFX9-NEXT: s_and_b32 s4, s15, 0xff
; GFX9-NEXT: s_lshl_b32 s6, s89, 8
; GFX9-NEXT: s_or_b32 s4, s4, s6
; GFX9-NEXT: s_and_b32 s6, s88, 0xff
-; GFX9-NEXT: s_lshl_b32 s7, s79, 8
+; GFX9-NEXT: s_lshl_b32 s8, s79, 8
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v38
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:12
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v22
-; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_or_b32 s6, s6, s8
; GFX9-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
@@ -53556,15 +54200,15 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX9-NEXT: s_or_b32 s4, s4, s6
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:16
; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_and_b32 s4, s23, 0xff
+; GFX9-NEXT: s_and_b32 s4, s13, 0xff
; GFX9-NEXT: s_lshl_b32 s6, s78, 8
; GFX9-NEXT: s_or_b32 s4, s4, s6
; GFX9-NEXT: s_and_b32 s6, s77, 0xff
-; GFX9-NEXT: s_lshl_b32 s7, s76, 8
+; GFX9-NEXT: s_lshl_b32 s8, s76, 8
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:20
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v35
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v21
-; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_or_b32 s6, s6, s8
; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v4, v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
@@ -53573,15 +54217,15 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX9-NEXT: s_or_b32 s4, s4, s6
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24
; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_and_b32 s4, s25, 0xff
+; GFX9-NEXT: s_and_b32 s4, s11, 0xff
; GFX9-NEXT: s_lshl_b32 s6, s75, 8
; GFX9-NEXT: s_or_b32 s4, s4, s6
; GFX9-NEXT: s_and_b32 s6, s74, 0xff
-; GFX9-NEXT: s_lshl_b32 s7, s73, 8
+; GFX9-NEXT: s_lshl_b32 s8, s73, 8
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:28
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v33
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v20
-; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_or_b32 s6, s6, s8
; GFX9-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v4, v32, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
@@ -53590,15 +54234,15 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX9-NEXT: s_or_b32 s4, s4, s6
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:32
; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_and_b32 s4, s27, 0xff
+; GFX9-NEXT: s_and_b32 s4, s9, 0xff
; GFX9-NEXT: s_lshl_b32 s6, s72, 8
; GFX9-NEXT: s_or_b32 s4, s4, s6
; GFX9-NEXT: s_and_b32 s6, s63, 0xff
-; GFX9-NEXT: s_lshl_b32 s7, s62, 8
+; GFX9-NEXT: s_lshl_b32 s8, s62, 8
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:36
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v31
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v19
-; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_or_b32 s6, s6, s8
; GFX9-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
@@ -53607,7 +54251,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX9-NEXT: s_or_b32 s4, s4, s6
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:40
; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_and_b32 s4, s29, 0xff
+; GFX9-NEXT: s_and_b32 s4, s7, 0xff
; GFX9-NEXT: s_lshl_b32 s6, s61, 8
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:44
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v29
@@ -58875,111 +59519,139 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i
; VI-LABEL: bitcast_v32i16_to_v32f16_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, s16
+; VI-NEXT: v_mov_b32_e32 v4, s17
+; VI-NEXT: v_mov_b32_e32 v5, s18
+; VI-NEXT: v_mov_b32_e32 v6, s19
+; VI-NEXT: v_mov_b32_e32 v7, s20
+; VI-NEXT: v_mov_b32_e32 v8, s21
+; VI-NEXT: v_mov_b32_e32 v9, s22
+; VI-NEXT: v_mov_b32_e32 v10, s23
+; VI-NEXT: v_mov_b32_e32 v11, s24
+; VI-NEXT: v_mov_b32_e32 v12, s25
+; VI-NEXT: v_mov_b32_e32 v13, s26
+; VI-NEXT: v_mov_b32_e32 v14, s27
+; VI-NEXT: v_mov_b32_e32 v15, s28
+; VI-NEXT: v_mov_b32_e32 v16, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: v_readfirstlane_b32 s21, v3
+; VI-NEXT: v_readfirstlane_b32 s20, v4
+; VI-NEXT: v_readfirstlane_b32 s19, v5
+; VI-NEXT: v_readfirstlane_b32 s18, v6
+; VI-NEXT: v_readfirstlane_b32 s17, v7
+; VI-NEXT: v_readfirstlane_b32 s16, v8
+; VI-NEXT: v_readfirstlane_b32 s15, v9
+; VI-NEXT: v_readfirstlane_b32 s14, v10
+; VI-NEXT: v_readfirstlane_b32 s13, v11
+; VI-NEXT: v_readfirstlane_b32 s12, v12
+; VI-NEXT: v_readfirstlane_b32 s11, v13
+; VI-NEXT: v_readfirstlane_b32 s10, v14
+; VI-NEXT: v_readfirstlane_b32 s8, v15
+; VI-NEXT: v_readfirstlane_b32 s7, v16
; VI-NEXT: v_readfirstlane_b32 s6, v0
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_readfirstlane_b32 s7, v1
+; VI-NEXT: v_readfirstlane_b32 s9, v1
; VI-NEXT: s_cbranch_scc0 .LBB89_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB89_3
; VI-NEXT: .LBB89_2: ; %cmp.true
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: s_add_i32 s5, s16, 3
-; VI-NEXT: s_and_b32 s8, s17, 0xffff0000
-; VI-NEXT: s_add_i32 s9, s17, 3
-; VI-NEXT: s_and_b32 s10, s18, 0xffff0000
-; VI-NEXT: s_add_i32 s11, s18, 3
-; VI-NEXT: s_and_b32 s12, s19, 0xffff0000
-; VI-NEXT: s_add_i32 s13, s19, 3
-; VI-NEXT: s_and_b32 s14, s20, 0xffff0000
-; VI-NEXT: s_add_i32 s15, s20, 3
-; VI-NEXT: s_and_b32 s16, s21, 0xffff0000
-; VI-NEXT: s_add_i32 s17, s21, 3
-; VI-NEXT: s_and_b32 s18, s22, 0xffff0000
-; VI-NEXT: s_add_i32 s19, s22, 3
-; VI-NEXT: s_and_b32 s20, s23, 0xffff0000
-; VI-NEXT: s_add_i32 s21, s23, 3
-; VI-NEXT: s_and_b32 s22, s24, 0xffff0000
-; VI-NEXT: s_add_i32 s23, s24, 3
-; VI-NEXT: s_and_b32 s24, s25, 0xffff0000
-; VI-NEXT: s_add_i32 s25, s25, 3
-; VI-NEXT: s_and_b32 s40, s26, 0xffff0000
-; VI-NEXT: s_add_i32 s26, s26, 3
-; VI-NEXT: s_and_b32 s41, s27, 0xffff0000
-; VI-NEXT: s_add_i32 s27, s27, 3
-; VI-NEXT: s_and_b32 s42, s28, 0xffff0000
-; VI-NEXT: s_add_i32 s28, s28, 3
-; VI-NEXT: s_and_b32 s43, s29, 0xffff0000
-; VI-NEXT: s_add_i32 s29, s29, 3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s21, 3
+; VI-NEXT: s_and_b32 s21, s20, 0xffff0000
+; VI-NEXT: s_add_i32 s20, s20, 3
+; VI-NEXT: s_and_b32 s22, s19, 0xffff0000
+; VI-NEXT: s_add_i32 s19, s19, 3
+; VI-NEXT: s_and_b32 s23, s18, 0xffff0000
+; VI-NEXT: s_add_i32 s18, s18, 3
+; VI-NEXT: s_and_b32 s24, s17, 0xffff0000
+; VI-NEXT: s_add_i32 s17, s17, 3
+; VI-NEXT: s_and_b32 s25, s16, 0xffff0000
+; VI-NEXT: s_add_i32 s16, s16, 3
+; VI-NEXT: s_and_b32 s26, s15, 0xffff0000
+; VI-NEXT: s_add_i32 s15, s15, 3
+; VI-NEXT: s_and_b32 s27, s14, 0xffff0000
+; VI-NEXT: s_add_i32 s14, s14, 3
+; VI-NEXT: s_and_b32 s28, s13, 0xffff0000
+; VI-NEXT: s_add_i32 s13, s13, 3
+; VI-NEXT: s_and_b32 s29, s12, 0xffff0000
+; VI-NEXT: s_add_i32 s12, s12, 3
+; VI-NEXT: s_and_b32 s40, s11, 0xffff0000
+; VI-NEXT: s_add_i32 s11, s11, 3
+; VI-NEXT: s_and_b32 s41, s10, 0xffff0000
+; VI-NEXT: s_add_i32 s10, s10, 3
+; VI-NEXT: s_and_b32 s42, s8, 0xffff0000
+; VI-NEXT: s_add_i32 s8, s8, 3
+; VI-NEXT: s_and_b32 s43, s7, 0xffff0000
+; VI-NEXT: s_add_i32 s7, s7, 3
; VI-NEXT: s_and_b32 s44, s6, 0xffff0000
; VI-NEXT: s_add_i32 s6, s6, 3
-; VI-NEXT: s_and_b32 s45, s7, 0xffff0000
-; VI-NEXT: s_add_i32 s7, s7, 3
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_and_b32 s45, s9, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s9, 3
+; VI-NEXT: s_and_b32 s9, s9, 0xffff
; VI-NEXT: s_and_b32 s6, s6, 0xffff
-; VI-NEXT: s_and_b32 s29, s29, 0xffff
-; VI-NEXT: s_and_b32 s28, s28, 0xffff
-; VI-NEXT: s_and_b32 s27, s27, 0xffff
-; VI-NEXT: s_and_b32 s26, s26, 0xffff
-; VI-NEXT: s_and_b32 s25, s25, 0xffff
-; VI-NEXT: s_and_b32 s23, s23, 0xffff
-; VI-NEXT: s_and_b32 s21, s21, 0xffff
-; VI-NEXT: s_and_b32 s19, s19, 0xffff
-; VI-NEXT: s_and_b32 s17, s17, 0xffff
-; VI-NEXT: s_and_b32 s15, s15, 0xffff
-; VI-NEXT: s_and_b32 s13, s13, 0xffff
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_and_b32 s8, s8, 0xffff
+; VI-NEXT: s_and_b32 s10, s10, 0xffff
; VI-NEXT: s_and_b32 s11, s11, 0xffff
-; VI-NEXT: s_and_b32 s9, s9, 0xffff
+; VI-NEXT: s_and_b32 s12, s12, 0xffff
+; VI-NEXT: s_and_b32 s13, s13, 0xffff
+; VI-NEXT: s_and_b32 s14, s14, 0xffff
+; VI-NEXT: s_and_b32 s15, s15, 0xffff
+; VI-NEXT: s_and_b32 s16, s16, 0xffff
+; VI-NEXT: s_and_b32 s17, s17, 0xffff
+; VI-NEXT: s_and_b32 s18, s18, 0xffff
+; VI-NEXT: s_and_b32 s19, s19, 0xffff
+; VI-NEXT: s_and_b32 s20, s20, 0xffff
; VI-NEXT: s_and_b32 s5, s5, 0xffff
-; VI-NEXT: s_or_b32 s7, s45, s7
+; VI-NEXT: s_or_b32 s9, s45, s9
; VI-NEXT: s_or_b32 s6, s44, s6
-; VI-NEXT: s_or_b32 s29, s43, s29
-; VI-NEXT: s_or_b32 s28, s42, s28
-; VI-NEXT: s_or_b32 s27, s41, s27
-; VI-NEXT: s_or_b32 s26, s40, s26
-; VI-NEXT: s_or_b32 s24, s24, s25
-; VI-NEXT: s_or_b32 s22, s22, s23
-; VI-NEXT: s_or_b32 s20, s20, s21
-; VI-NEXT: s_or_b32 s18, s18, s19
-; VI-NEXT: s_or_b32 s16, s16, s17
-; VI-NEXT: s_or_b32 s14, s14, s15
-; VI-NEXT: s_or_b32 s12, s12, s13
-; VI-NEXT: s_or_b32 s10, s10, s11
-; VI-NEXT: s_or_b32 s8, s8, s9
+; VI-NEXT: s_or_b32 s7, s43, s7
+; VI-NEXT: s_or_b32 s8, s42, s8
+; VI-NEXT: s_or_b32 s10, s41, s10
+; VI-NEXT: s_or_b32 s11, s40, s11
+; VI-NEXT: s_or_b32 s12, s29, s12
+; VI-NEXT: s_or_b32 s13, s28, s13
+; VI-NEXT: s_or_b32 s14, s27, s14
+; VI-NEXT: s_or_b32 s15, s26, s15
+; VI-NEXT: s_or_b32 s16, s25, s16
+; VI-NEXT: s_or_b32 s17, s24, s17
+; VI-NEXT: s_or_b32 s18, s23, s18
+; VI-NEXT: s_or_b32 s19, s22, s19
+; VI-NEXT: s_or_b32 s20, s21, s20
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s7, s7, 0x30000
+; VI-NEXT: s_add_i32 s9, s9, 0x30000
; VI-NEXT: s_add_i32 s6, s6, 0x30000
-; VI-NEXT: s_add_i32 s29, s29, 0x30000
-; VI-NEXT: s_add_i32 s28, s28, 0x30000
-; VI-NEXT: s_add_i32 s27, s27, 0x30000
-; VI-NEXT: s_add_i32 s26, s26, 0x30000
-; VI-NEXT: s_add_i32 s25, s24, 0x30000
-; VI-NEXT: s_add_i32 s24, s22, 0x30000
-; VI-NEXT: s_add_i32 s23, s20, 0x30000
-; VI-NEXT: s_add_i32 s22, s18, 0x30000
-; VI-NEXT: s_add_i32 s21, s16, 0x30000
-; VI-NEXT: s_add_i32 s20, s14, 0x30000
-; VI-NEXT: s_add_i32 s19, s12, 0x30000
-; VI-NEXT: s_add_i32 s18, s10, 0x30000
-; VI-NEXT: s_add_i32 s17, s8, 0x30000
-; VI-NEXT: s_add_i32 s16, s4, 0x30000
+; VI-NEXT: s_add_i32 s7, s7, 0x30000
+; VI-NEXT: s_add_i32 s8, s8, 0x30000
+; VI-NEXT: s_add_i32 s10, s10, 0x30000
+; VI-NEXT: s_add_i32 s11, s11, 0x30000
+; VI-NEXT: s_add_i32 s12, s12, 0x30000
+; VI-NEXT: s_add_i32 s13, s13, 0x30000
+; VI-NEXT: s_add_i32 s14, s14, 0x30000
+; VI-NEXT: s_add_i32 s15, s15, 0x30000
+; VI-NEXT: s_add_i32 s16, s16, 0x30000
+; VI-NEXT: s_add_i32 s17, s17, 0x30000
+; VI-NEXT: s_add_i32 s18, s18, 0x30000
+; VI-NEXT: s_add_i32 s19, s19, 0x30000
+; VI-NEXT: s_add_i32 s20, s20, 0x30000
+; VI-NEXT: s_add_i32 s21, s4, 0x30000
; VI-NEXT: .LBB89_3: ; %end
-; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v1, s17
-; VI-NEXT: v_mov_b32_e32 v2, s18
-; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: v_mov_b32_e32 v4, s20
-; VI-NEXT: v_mov_b32_e32 v5, s21
-; VI-NEXT: v_mov_b32_e32 v6, s22
-; VI-NEXT: v_mov_b32_e32 v7, s23
-; VI-NEXT: v_mov_b32_e32 v8, s24
-; VI-NEXT: v_mov_b32_e32 v9, s25
-; VI-NEXT: v_mov_b32_e32 v10, s26
-; VI-NEXT: v_mov_b32_e32 v11, s27
-; VI-NEXT: v_mov_b32_e32 v12, s28
-; VI-NEXT: v_mov_b32_e32 v13, s29
+; VI-NEXT: v_mov_b32_e32 v0, s21
+; VI-NEXT: v_mov_b32_e32 v1, s20
+; VI-NEXT: v_mov_b32_e32 v2, s19
+; VI-NEXT: v_mov_b32_e32 v3, s18
+; VI-NEXT: v_mov_b32_e32 v4, s17
+; VI-NEXT: v_mov_b32_e32 v5, s16
+; VI-NEXT: v_mov_b32_e32 v6, s15
+; VI-NEXT: v_mov_b32_e32 v7, s14
+; VI-NEXT: v_mov_b32_e32 v8, s13
+; VI-NEXT: v_mov_b32_e32 v9, s12
+; VI-NEXT: v_mov_b32_e32 v10, s11
+; VI-NEXT: v_mov_b32_e32 v11, s10
+; VI-NEXT: v_mov_b32_e32 v12, s8
+; VI-NEXT: v_mov_b32_e32 v13, s7
; VI-NEXT: v_mov_b32_e32 v14, s6
-; VI-NEXT: v_mov_b32_e32 v15, s7
+; VI-NEXT: v_mov_b32_e32 v15, s9
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB89_4:
; VI-NEXT: s_branch .LBB89_2
@@ -60336,111 +61008,139 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a
; VI-LABEL: bitcast_v32i16_to_v32bf16_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, s16
+; VI-NEXT: v_mov_b32_e32 v4, s17
+; VI-NEXT: v_mov_b32_e32 v5, s18
+; VI-NEXT: v_mov_b32_e32 v6, s19
+; VI-NEXT: v_mov_b32_e32 v7, s20
+; VI-NEXT: v_mov_b32_e32 v8, s21
+; VI-NEXT: v_mov_b32_e32 v9, s22
+; VI-NEXT: v_mov_b32_e32 v10, s23
+; VI-NEXT: v_mov_b32_e32 v11, s24
+; VI-NEXT: v_mov_b32_e32 v12, s25
+; VI-NEXT: v_mov_b32_e32 v13, s26
+; VI-NEXT: v_mov_b32_e32 v14, s27
+; VI-NEXT: v_mov_b32_e32 v15, s28
+; VI-NEXT: v_mov_b32_e32 v16, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: v_readfirstlane_b32 s21, v3
+; VI-NEXT: v_readfirstlane_b32 s20, v4
+; VI-NEXT: v_readfirstlane_b32 s19, v5
+; VI-NEXT: v_readfirstlane_b32 s18, v6
+; VI-NEXT: v_readfirstlane_b32 s17, v7
+; VI-NEXT: v_readfirstlane_b32 s16, v8
+; VI-NEXT: v_readfirstlane_b32 s15, v9
+; VI-NEXT: v_readfirstlane_b32 s14, v10
+; VI-NEXT: v_readfirstlane_b32 s13, v11
+; VI-NEXT: v_readfirstlane_b32 s12, v12
+; VI-NEXT: v_readfirstlane_b32 s11, v13
+; VI-NEXT: v_readfirstlane_b32 s10, v14
+; VI-NEXT: v_readfirstlane_b32 s8, v15
+; VI-NEXT: v_readfirstlane_b32 s7, v16
; VI-NEXT: v_readfirstlane_b32 s6, v0
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_readfirstlane_b32 s7, v1
+; VI-NEXT: v_readfirstlane_b32 s9, v1
; VI-NEXT: s_cbranch_scc0 .LBB93_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB93_3
; VI-NEXT: .LBB93_2: ; %cmp.true
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: s_add_i32 s5, s16, 3
-; VI-NEXT: s_and_b32 s8, s17, 0xffff0000
-; VI-NEXT: s_add_i32 s9, s17, 3
-; VI-NEXT: s_and_b32 s10, s18, 0xffff0000
-; VI-NEXT: s_add_i32 s11, s18, 3
-; VI-NEXT: s_and_b32 s12, s19, 0xffff0000
-; VI-NEXT: s_add_i32 s13, s19, 3
-; VI-NEXT: s_and_b32 s14, s20, 0xffff0000
-; VI-NEXT: s_add_i32 s15, s20, 3
-; VI-NEXT: s_and_b32 s16, s21, 0xffff0000
-; VI-NEXT: s_add_i32 s17, s21, 3
-; VI-NEXT: s_and_b32 s18, s22, 0xffff0000
-; VI-NEXT: s_add_i32 s19, s22, 3
-; VI-NEXT: s_and_b32 s20, s23, 0xffff0000
-; VI-NEXT: s_add_i32 s21, s23, 3
-; VI-NEXT: s_and_b32 s22, s24, 0xffff0000
-; VI-NEXT: s_add_i32 s23, s24, 3
-; VI-NEXT: s_and_b32 s24, s25, 0xffff0000
-; VI-NEXT: s_add_i32 s25, s25, 3
-; VI-NEXT: s_and_b32 s40, s26, 0xffff0000
-; VI-NEXT: s_add_i32 s26, s26, 3
-; VI-NEXT: s_and_b32 s41, s27, 0xffff0000
-; VI-NEXT: s_add_i32 s27, s27, 3
-; VI-NEXT: s_and_b32 s42, s28, 0xffff0000
-; VI-NEXT: s_add_i32 s28, s28, 3
-; VI-NEXT: s_and_b32 s43, s29, 0xffff0000
-; VI-NEXT: s_add_i32 s29, s29, 3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: s_add_i32 s5, s21, 3
+; VI-NEXT: s_and_b32 s21, s20, 0xffff0000
+; VI-NEXT: s_add_i32 s20, s20, 3
+; VI-NEXT: s_and_b32 s22, s19, 0xffff0000
+; VI-NEXT: s_add_i32 s19, s19, 3
+; VI-NEXT: s_and_b32 s23, s18, 0xffff0000
+; VI-NEXT: s_add_i32 s18, s18, 3
+; VI-NEXT: s_and_b32 s24, s17, 0xffff0000
+; VI-NEXT: s_add_i32 s17, s17, 3
+; VI-NEXT: s_and_b32 s25, s16, 0xffff0000
+; VI-NEXT: s_add_i32 s16, s16, 3
+; VI-NEXT: s_and_b32 s26, s15, 0xffff0000
+; VI-NEXT: s_add_i32 s15, s15, 3
+; VI-NEXT: s_and_b32 s27, s14, 0xffff0000
+; VI-NEXT: s_add_i32 s14, s14, 3
+; VI-NEXT: s_and_b32 s28, s13, 0xffff0000
+; VI-NEXT: s_add_i32 s13, s13, 3
+; VI-NEXT: s_and_b32 s29, s12, 0xffff0000
+; VI-NEXT: s_add_i32 s12, s12, 3
+; VI-NEXT: s_and_b32 s40, s11, 0xffff0000
+; VI-NEXT: s_add_i32 s11, s11, 3
+; VI-NEXT: s_and_b32 s41, s10, 0xffff0000
+; VI-NEXT: s_add_i32 s10, s10, 3
+; VI-NEXT: s_and_b32 s42, s8, 0xffff0000
+; VI-NEXT: s_add_i32 s8, s8, 3
+; VI-NEXT: s_and_b32 s43, s7, 0xffff0000
+; VI-NEXT: s_add_i32 s7, s7, 3
; VI-NEXT: s_and_b32 s44, s6, 0xffff0000
; VI-NEXT: s_add_i32 s6, s6, 3
-; VI-NEXT: s_and_b32 s45, s7, 0xffff0000
-; VI-NEXT: s_add_i32 s7, s7, 3
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_and_b32 s45, s9, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s9, 3
+; VI-NEXT: s_and_b32 s9, s9, 0xffff
; VI-NEXT: s_and_b32 s6, s6, 0xffff
-; VI-NEXT: s_and_b32 s29, s29, 0xffff
-; VI-NEXT: s_and_b32 s28, s28, 0xffff
-; VI-NEXT: s_and_b32 s27, s27, 0xffff
-; VI-NEXT: s_and_b32 s26, s26, 0xffff
-; VI-NEXT: s_and_b32 s25, s25, 0xffff
-; VI-NEXT: s_and_b32 s23, s23, 0xffff
-; VI-NEXT: s_and_b32 s21, s21, 0xffff
-; VI-NEXT: s_and_b32 s19, s19, 0xffff
-; VI-NEXT: s_and_b32 s17, s17, 0xffff
-; VI-NEXT: s_and_b32 s15, s15, 0xffff
-; VI-NEXT: s_and_b32 s13, s13, 0xffff
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_and_b32 s8, s8, 0xffff
+; VI-NEXT: s_and_b32 s10, s10, 0xffff
; VI-NEXT: s_and_b32 s11, s11, 0xffff
-; VI-NEXT: s_and_b32 s9, s9, 0xffff
+; VI-NEXT: s_and_b32 s12, s12, 0xffff
+; VI-NEXT: s_and_b32 s13, s13, 0xffff
+; VI-NEXT: s_and_b32 s14, s14, 0xffff
+; VI-NEXT: s_and_b32 s15, s15, 0xffff
+; VI-NEXT: s_and_b32 s16, s16, 0xffff
+; VI-NEXT: s_and_b32 s17, s17, 0xffff
+; VI-NEXT: s_and_b32 s18, s18, 0xffff
+; VI-NEXT: s_and_b32 s19, s19, 0xffff
+; VI-NEXT: s_and_b32 s20, s20, 0xffff
; VI-NEXT: s_and_b32 s5, s5, 0xffff
-; VI-NEXT: s_or_b32 s7, s45, s7
+; VI-NEXT: s_or_b32 s9, s45, s9
; VI-NEXT: s_or_b32 s6, s44, s6
-; VI-NEXT: s_or_b32 s29, s43, s29
-; VI-NEXT: s_or_b32 s28, s42, s28
-; VI-NEXT: s_or_b32 s27, s41, s27
-; VI-NEXT: s_or_b32 s26, s40, s26
-; VI-NEXT: s_or_b32 s24, s24, s25
-; VI-NEXT: s_or_b32 s22, s22, s23
-; VI-NEXT: s_or_b32 s20, s20, s21
-; VI-NEXT: s_or_b32 s18, s18, s19
-; VI-NEXT: s_or_b32 s16, s16, s17
-; VI-NEXT: s_or_b32 s14, s14, s15
-; VI-NEXT: s_or_b32 s12, s12, s13
-; VI-NEXT: s_or_b32 s10, s10, s11
-; VI-NEXT: s_or_b32 s8, s8, s9
+; VI-NEXT: s_or_b32 s7, s43, s7
+; VI-NEXT: s_or_b32 s8, s42, s8
+; VI-NEXT: s_or_b32 s10, s41, s10
+; VI-NEXT: s_or_b32 s11, s40, s11
+; VI-NEXT: s_or_b32 s12, s29, s12
+; VI-NEXT: s_or_b32 s13, s28, s13
+; VI-NEXT: s_or_b32 s14, s27, s14
+; VI-NEXT: s_or_b32 s15, s26, s15
+; VI-NEXT: s_or_b32 s16, s25, s16
+; VI-NEXT: s_or_b32 s17, s24, s17
+; VI-NEXT: s_or_b32 s18, s23, s18
+; VI-NEXT: s_or_b32 s19, s22, s19
+; VI-NEXT: s_or_b32 s20, s21, s20
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s7, s7, 0x30000
+; VI-NEXT: s_add_i32 s9, s9, 0x30000
; VI-NEXT: s_add_i32 s6, s6, 0x30000
-; VI-NEXT: s_add_i32 s29, s29, 0x30000
-; VI-NEXT: s_add_i32 s28, s28, 0x30000
-; VI-NEXT: s_add_i32 s27, s27, 0x30000
-; VI-NEXT: s_add_i32 s26, s26, 0x30000
-; VI-NEXT: s_add_i32 s25, s24, 0x30000
-; VI-NEXT: s_add_i32 s24, s22, 0x30000
-; VI-NEXT: s_add_i32 s23, s20, 0x30000
-; VI-NEXT: s_add_i32 s22, s18, 0x30000
-; VI-NEXT: s_add_i32 s21, s16, 0x30000
-; VI-NEXT: s_add_i32 s20, s14, 0x30000
-; VI-NEXT: s_add_i32 s19, s12, 0x30000
-; VI-NEXT: s_add_i32 s18, s10, 0x30000
-; VI-NEXT: s_add_i32 s17, s8, 0x30000
-; VI-NEXT: s_add_i32 s16, s4, 0x30000
+; VI-NEXT: s_add_i32 s7, s7, 0x30000
+; VI-NEXT: s_add_i32 s8, s8, 0x30000
+; VI-NEXT: s_add_i32 s10, s10, 0x30000
+; VI-NEXT: s_add_i32 s11, s11, 0x30000
+; VI-NEXT: s_add_i32 s12, s12, 0x30000
+; VI-NEXT: s_add_i32 s13, s13, 0x30000
+; VI-NEXT: s_add_i32 s14, s14, 0x30000
+; VI-NEXT: s_add_i32 s15, s15, 0x30000
+; VI-NEXT: s_add_i32 s16, s16, 0x30000
+; VI-NEXT: s_add_i32 s17, s17, 0x30000
+; VI-NEXT: s_add_i32 s18, s18, 0x30000
+; VI-NEXT: s_add_i32 s19, s19, 0x30000
+; VI-NEXT: s_add_i32 s20, s20, 0x30000
+; VI-NEXT: s_add_i32 s21, s4, 0x30000
; VI-NEXT: .LBB93_3: ; %end
-; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v1, s17
-; VI-NEXT: v_mov_b32_e32 v2, s18
-; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: v_mov_b32_e32 v4, s20
-; VI-NEXT: v_mov_b32_e32 v5, s21
-; VI-NEXT: v_mov_b32_e32 v6, s22
-; VI-NEXT: v_mov_b32_e32 v7, s23
-; VI-NEXT: v_mov_b32_e32 v8, s24
-; VI-NEXT: v_mov_b32_e32 v9, s25
-; VI-NEXT: v_mov_b32_e32 v10, s26
-; VI-NEXT: v_mov_b32_e32 v11, s27
-; VI-NEXT: v_mov_b32_e32 v12, s28
-; VI-NEXT: v_mov_b32_e32 v13, s29
+; VI-NEXT: v_mov_b32_e32 v0, s21
+; VI-NEXT: v_mov_b32_e32 v1, s20
+; VI-NEXT: v_mov_b32_e32 v2, s19
+; VI-NEXT: v_mov_b32_e32 v3, s18
+; VI-NEXT: v_mov_b32_e32 v4, s17
+; VI-NEXT: v_mov_b32_e32 v5, s16
+; VI-NEXT: v_mov_b32_e32 v6, s15
+; VI-NEXT: v_mov_b32_e32 v7, s14
+; VI-NEXT: v_mov_b32_e32 v8, s13
+; VI-NEXT: v_mov_b32_e32 v9, s12
+; VI-NEXT: v_mov_b32_e32 v10, s11
+; VI-NEXT: v_mov_b32_e32 v11, s10
+; VI-NEXT: v_mov_b32_e32 v12, s8
+; VI-NEXT: v_mov_b32_e32 v13, s7
; VI-NEXT: v_mov_b32_e32 v14, s6
-; VI-NEXT: v_mov_b32_e32 v15, s7
+; VI-NEXT: v_mov_b32_e32 v15, s9
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB93_4:
; VI-NEXT: s_branch .LBB93_2
@@ -61971,189 +62671,204 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: s_waitcnt expcnt(6)
-; SI-NEXT: v_mul_f32_e64 v57, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v45, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; SI-NEXT: v_mul_f32_e64 v32, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v63, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v62, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v61, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v60, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v59, 1.0, s28
-; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mul_f32_e64 v62, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v61, 1.0, s17
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v48, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v59, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v19, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17
+; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e64 v63, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v9, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v55, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v11, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v13, 1.0, s29
; SI-NEXT: s_cbranch_scc0 .LBB95_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v57
-; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v63
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v62
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v51
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v61
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v60
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v59
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v39
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v45
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v43
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v49
-; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v46
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53
-; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v44
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v55
-; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v42
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v40
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v61
+; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v63
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v48
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v59
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v58
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v21
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v57
+; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v25
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v56
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v29
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v55
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v37
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v47
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31
; SI-NEXT: s_cbranch_execnz .LBB95_3
; SI-NEXT: .LBB95_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57
-; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53
+; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v60
-; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT: v_alignbit_b32 v8, v7, v2, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; SI-NEXT: v_lshr_b64 v[12:13], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7
-; SI-NEXT: v_alignbit_b32 v12, v10, v2, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9
+; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT: v_alignbit_b32 v16, v11, v2, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v44
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10
+; SI-NEXT: v_lshr_b64 v[20:21], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11
-; SI-NEXT: v_alignbit_b32 v20, v14, v2, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v13
+; SI-NEXT: v_lshr_b64 v[24:25], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT: v_alignbit_b32 v24, v15, v2, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41
-; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14
+; SI-NEXT: v_lshr_b64 v[28:29], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v29
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21
-; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35
+; SI-NEXT: v_lshr_b64 v[38:39], v[38:39], 16
+; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v11
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52
-; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v10
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v18, v19, v2, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v30, v31, v15, 16
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37
+; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshr_b64 v[18:19], v[38:39], 16
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55
+; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50
+; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6
+; SI-NEXT: v_lshr_b64 v[6:7], v[50:51], 16
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v9
+; SI-NEXT: v_lshr_b64 v[10:11], v[48:49], 16
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14
+; SI-NEXT: v_lshr_b64 v[14:15], v[37:38], 16
+; SI-NEXT: v_lshr_b64 v[40:41], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; SI-NEXT: v_lshr_b64 v[41:42], v[9:10], 16
+; SI-NEXT: v_lshr_b64 v[22:23], v[35:36], 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_lshr_b64 v[42:43], v[13:14], 16
+; SI-NEXT: v_lshr_b64 v[26:27], v[33:34], 16
+; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0
+; SI-NEXT: v_lshr_b64 v[43:44], v[17:18], 16
+; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16
+; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16
+; SI-NEXT: v_lshr_b64 v[44:45], v[21:22], 16
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
-; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28
-; SI-NEXT: v_lshr_b64 v[33:34], v[1:2], 16
-; SI-NEXT: v_lshr_b64 v[34:35], v[5:6], 16
-; SI-NEXT: v_lshr_b64 v[35:36], v[9:10], 16
-; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16
-; SI-NEXT: v_lshr_b64 v[37:38], v[17:18], 16
-; SI-NEXT: v_lshr_b64 v[38:39], v[21:22], 16
-; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16
-; SI-NEXT: v_lshr_b64 v[48:49], v[29:30], 16
-; SI-NEXT: v_alignbit_b32 v28, v40, v41, 16
+; SI-NEXT: v_lshr_b64 v[45:46], v[25:26], 16
+; SI-NEXT: v_lshr_b64 v[54:55], v[1:2], 16
+; SI-NEXT: v_lshr_b64 v[46:47], v[29:30], 16
; SI-NEXT: .LBB95_3: ; %end
+; SI-NEXT: v_mov_b32_e32 v5, v40
+; SI-NEXT: v_mov_b32_e32 v9, v41
+; SI-NEXT: v_mov_b32_e32 v13, v42
+; SI-NEXT: v_mov_b32_e32 v17, v43
+; SI-NEXT: v_mov_b32_e32 v21, v44
+; SI-NEXT: v_mov_b32_e32 v25, v45
+; SI-NEXT: v_mov_b32_e32 v29, v46
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -62170,695 +62885,395 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v1, v33
-; SI-NEXT: v_mov_b32_e32 v5, v34
-; SI-NEXT: v_mov_b32_e32 v9, v35
-; SI-NEXT: v_mov_b32_e32 v13, v36
-; SI-NEXT: v_mov_b32_e32 v17, v37
-; SI-NEXT: v_mov_b32_e32 v21, v38
-; SI-NEXT: v_mov_b32_e32 v25, v50
-; SI-NEXT: v_mov_b32_e32 v29, v48
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v1, v54
+; SI-NEXT: v_mov_b32_e32 v3, v53
+; SI-NEXT: v_mov_b32_e32 v7, v51
+; SI-NEXT: v_mov_b32_e32 v11, v49
+; SI-NEXT: v_mov_b32_e32 v15, v38
+; SI-NEXT: v_mov_b32_e32 v19, v39
+; SI-NEXT: v_mov_b32_e32 v23, v36
+; SI-NEXT: v_mov_b32_e32 v27, v34
+; SI-NEXT: v_mov_b32_e32 v31, v32
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB95_4:
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr51
; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: s_branch .LBB95_2
;
; VI-LABEL: bitcast_v32bf16_to_v32i16_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v20, s30, 0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: v_writelane_b32 v20, s31, 1
-; VI-NEXT: v_readfirstlane_b32 s30, v0
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; VI-NEXT: v_mov_b32_e32 v15, v1
+; VI-NEXT: v_mov_b32_e32 v14, v0
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v22, s17
+; VI-NEXT: v_mov_b32_e32 v2, s18
+; VI-NEXT: v_mov_b32_e32 v21, s19
+; VI-NEXT: v_mov_b32_e32 v4, s20
+; VI-NEXT: v_mov_b32_e32 v20, s21
+; VI-NEXT: v_mov_b32_e32 v6, s22
+; VI-NEXT: v_mov_b32_e32 v19, s23
+; VI-NEXT: v_mov_b32_e32 v8, s24
+; VI-NEXT: v_mov_b32_e32 v18, s25
+; VI-NEXT: v_mov_b32_e32 v17, s27
+; VI-NEXT: v_mov_b32_e32 v16, s29
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_readfirstlane_b32 s31, v1
-; VI-NEXT: s_cbranch_scc0 .LBB95_3
+; VI-NEXT: v_mov_b32_e32 v10, s26
+; VI-NEXT: v_mov_b32_e32 v12, s28
+; VI-NEXT: s_cbranch_scc0 .LBB95_4
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB95_4
+; VI-NEXT: s_cbranch_execnz .LBB95_3
; VI-NEXT: .LBB95_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v0, s4, v1
-; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: s_lshl_b32 s5, s30, 16
-; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s5, v1
+; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v1
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_bfe_u32 v5, v2, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v3
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s5, s30, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s5, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_lshl_b32 s5, s31, 16
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s5, v1
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v21
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v21
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_bfe_u32 v7, v4, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v24, v9, v11, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v4
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v5
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: s_and_b32 s5, s31, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_add_f32_e32 v5, s5, v1
-; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
-; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v16, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v16, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc
-; VI-NEXT: v_add_f32_e32 v16, s4, v1
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16
-; VI-NEXT: v_add_f32_e32 v16, s4, v1
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_add_f32_e32 v17, s4, v1
-; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v20
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_bfe_u32 v9, v6, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v7
+; VI-NEXT: v_bfe_u32 v7, v6, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_cndmask_b32_e32 v6, v7, v11, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v19
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v19
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_bfe_u32 v11, v8, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v26, v13, v19, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v8
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_cndmask_b32_e32 v8, v9, v13, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v11
+; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v18
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_cndmask_b32_e32 v11, v13, v19, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v18
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_bfe_u32 v18, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v13
; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; VI-NEXT: v_bfe_u32 v13, v10, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v11, v10, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v13, v13, v28, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; VI-NEXT: v_cndmask_b32_e32 v10, v11, v28, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v13
+; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_bfe_u32 v28, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v13
+; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_or_b32_e32 v29, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_cndmask_b32_e32 v13, v28, v29, vcc
+; VI-NEXT: v_bfe_u32 v28, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v17
+; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28
+; VI-NEXT: v_or_b32_e32 v29, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT: v_add_f32_e32 v1, s4, v1
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; VI-NEXT: v_bfe_u32 v18, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16
-; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16
-; VI-NEXT: s_branch .LBB95_5
-; VI-NEXT: .LBB95_3:
-; VI-NEXT: s_branch .LBB95_2
-; VI-NEXT: .LBB95_4:
-; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v1, s17
-; VI-NEXT: v_mov_b32_e32 v2, s18
-; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: v_mov_b32_e32 v4, s20
-; VI-NEXT: v_mov_b32_e32 v5, s21
-; VI-NEXT: v_mov_b32_e32 v6, s22
-; VI-NEXT: v_mov_b32_e32 v7, s23
-; VI-NEXT: v_mov_b32_e32 v8, s24
-; VI-NEXT: v_mov_b32_e32 v9, s25
-; VI-NEXT: v_mov_b32_e32 v10, s26
-; VI-NEXT: v_mov_b32_e32 v11, s27
-; VI-NEXT: v_mov_b32_e32 v12, s28
-; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: v_mov_b32_e32 v14, s30
-; VI-NEXT: v_mov_b32_e32 v15, s31
-; VI-NEXT: .LBB95_5: ; %end
-; VI-NEXT: v_readlane_b32 s31, v20, 1
-; VI-NEXT: v_readlane_b32 s30, v20, 0
-; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cndmask_b32_e32 v28, v28, v29, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; VI-NEXT: v_bfe_u32 v17, v12, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v12
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v13
+; VI-NEXT: v_bfe_u32 v13, v12, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v17, v17, v30, vcc
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; VI-NEXT: v_cndmask_b32_e32 v12, v13, v30, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v30, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v17
+; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; VI-NEXT: v_or_b32_e32 v31, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_cndmask_b32_e32 v17, v30, v31, vcc
+; VI-NEXT: v_bfe_u32 v30, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v16
+; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30
+; VI-NEXT: v_or_b32_e32 v31, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_cndmask_b32_e32 v16, v30, v31, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v14
+; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; VI-NEXT: v_bfe_u32 v31, v14, 16, 1
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14
+; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; VI-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc
+; VI-NEXT: v_bfe_u32 v31, v30, 16, 1
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30
+; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v30
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; VI-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v15
+; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; VI-NEXT: v_bfe_u32 v32, v14, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v14
+; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23]
+; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[24:25]
+; VI-NEXT: v_bfe_u32 v15, v14, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21]
+; VI-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14
+; VI-NEXT: v_mov_b32_e32 v21, v23
+; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[26:27]
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_mov_b32_e32 v19, v23
+; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[28:29]
+; VI-NEXT: v_cndmask_b32_e32 v14, v15, v33, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v32
+; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17]
+; VI-NEXT: v_mov_b32_e32 v17, v23
+; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[14:15]
+; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[30:31]
+; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[12:13]
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11]
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9]
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7]
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5]
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v15, v23
+; VI-NEXT: .LBB95_3: ; %end
+; VI-NEXT: v_mov_b32_e32 v1, v22
+; VI-NEXT: v_mov_b32_e32 v3, v21
+; VI-NEXT: v_mov_b32_e32 v5, v20
+; VI-NEXT: v_mov_b32_e32 v7, v19
+; VI-NEXT: v_mov_b32_e32 v9, v18
+; VI-NEXT: v_mov_b32_e32 v11, v17
+; VI-NEXT: v_mov_b32_e32 v13, v16
; VI-NEXT: s_setpc_b64 s[30:31]
+; VI-NEXT: .LBB95_4:
+; VI-NEXT: s_branch .LBB95_2
;
; GFX9-LABEL: bitcast_v32bf16_to_v32i16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v20, s30, 0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT: v_writelane_b32 v20, s31, 1
-; GFX9-NEXT: v_readfirstlane_b32 s30, v0
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT: v_readfirstlane_b32 s31, v1
-; GFX9-NEXT: s_cbranch_scc0 .LBB95_3
-; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB95_4
-; GFX9-NEXT: .LBB95_2: ; %cmp.true
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; GFX9-NEXT: s_and_b32 s5, s30, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v1, s5, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s5, s30, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s5, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b32 s5, s31, 0xffff0000
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add_f32_e32 v3, s5, v0
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
-; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: s_lshl_b32 s5, s31, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_add_f32_e32 v4, s5, v0
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000
-; GFX9-NEXT: v_add_u32_e32 v5, v5, v4
-; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_and_or_b32 v14, v1, v16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_and_or_b32 v15, v3, v16, v4
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s29, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_or_b32 v13, v1, v16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s28, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_or_b32 v12, v1, v16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s27, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_or_b32 v11, v1, v16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s26, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_or_b32 v10, v1, v16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s25, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_or_b32 v9, v1, v16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s24, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_or_b32 v8, v1, v16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s23, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_or_b32 v7, v1, v16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s22, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_or_b32 v6, v1, v16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s21, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_or_b32 v5, v1, v16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s20, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_or_b32 v4, v1, v16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s19, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc
-; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_or_b32 v3, v1, v16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s18, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v17, v17, v2
-; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc
-; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_or_b32 v2, v1, v16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v17, v17, v1
-; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s17, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc
-; GFX9-NEXT: v_add_f32_e32 v17, s4, v0
-; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
-; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
-; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX9-NEXT: v_and_or_b32 v1, v1, v16, v17
-; GFX9-NEXT: v_add_f32_e32 v17, s4, v0
-; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
-; GFX9-NEXT: s_lshl_b32 s4, s16, 16
-; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
-; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_add_f32_e32 v0, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v18, v18, v0
-; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
-; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_and_or_b32 v0, v17, v16, v0
-; GFX9-NEXT: s_branch .LBB95_5
-; GFX9-NEXT: .LBB95_3:
-; GFX9-NEXT: s_branch .LBB95_2
-; GFX9-NEXT: .LBB95_4:
+; GFX9-NEXT: v_mov_b32_e32 v13, v2
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; GFX9-NEXT: v_mov_b32_e32 v15, v1
+; GFX9-NEXT: v_mov_b32_e32 v14, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
@@ -62872,17 +63287,305 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; GFX9-NEXT: v_mov_b32_e32 v10, s26
; GFX9-NEXT: v_mov_b32_e32 v11, s27
; GFX9-NEXT: v_mov_b32_e32 v12, s28
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: v_mov_b32_e32 v14, s30
-; GFX9-NEXT: v_mov_b32_e32 v15, s31
-; GFX9-NEXT: .LBB95_5: ; %end
-; GFX9-NEXT: v_readlane_b32 s31, v20, 1
-; GFX9-NEXT: v_readlane_b32 s30, v20, 0
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_cbranch_scc0 .LBB95_4
+; GFX9-NEXT: ; %bb.1: ; %cmp.false
+; GFX9-NEXT: s_cbranch_execnz .LBB95_3
+; GFX9-NEXT: .LBB95_2: ; %cmp.true
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v0
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v17, v17, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v0, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v17, v17, v0
+; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v1
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v2
+; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v19, v19, v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19
+; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc
+; GFX9-NEXT: v_bfe_u32 v19, v2, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v19, v19, v2
+; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19
+; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v20, vcc
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v3
+; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; GFX9-NEXT: v_bfe_u32 v20, v19, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v20, v20, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_add_u32_e32 v20, 0x7fff, v20
+; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc
+; GFX9-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v20, v20, v3
+; GFX9-NEXT: v_add_u32_e32 v20, 0x7fff, v20
+; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v3
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4
+; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; GFX9-NEXT: v_bfe_u32 v21, v20, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v21, v21, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_add_u32_e32 v21, 0x7fff, v21
+; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc
+; GFX9-NEXT: v_bfe_u32 v21, v4, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v21, v21, v4
+; GFX9-NEXT: v_add_u32_e32 v21, 0x7fff, v21
+; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v4
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc
+; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v5
+; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; GFX9-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v22, v22, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_add_u32_e32 v22, 0x7fff, v22
+; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc
+; GFX9-NEXT: v_bfe_u32 v22, v5, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v22, v22, v5
+; GFX9-NEXT: v_add_u32_e32 v22, 0x7fff, v22
+; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v5
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc
+; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v6
+; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; GFX9-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v23, v23, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_add_u32_e32 v23, 0x7fff, v23
+; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc
+; GFX9-NEXT: v_bfe_u32 v23, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v23, v23, v6
+; GFX9-NEXT: v_add_u32_e32 v23, 0x7fff, v23
+; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v24, vcc
+; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v7
+; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; GFX9-NEXT: v_bfe_u32 v24, v23, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v24, v24, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_add_u32_e32 v24, 0x7fff, v24
+; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc
+; GFX9-NEXT: v_bfe_u32 v24, v7, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v24, v24, v7
+; GFX9-NEXT: v_add_u32_e32 v24, 0x7fff, v24
+; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc
+; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v8
+; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; GFX9-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v25, v25, v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_add_u32_e32 v25, 0x7fff, v25
+; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX9-NEXT: v_bfe_u32 v25, v8, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v25, v25, v8
+; GFX9-NEXT: v_add_u32_e32 v25, 0x7fff, v25
+; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v26, vcc
+; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v9
+; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v26, v26, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_add_u32_e32 v26, 0x7fff, v26
+; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc
+; GFX9-NEXT: v_bfe_u32 v26, v9, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v26, v26, v9
+; GFX9-NEXT: v_add_u32_e32 v26, 0x7fff, v26
+; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v9
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v27, vcc
+; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v10
+; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; GFX9-NEXT: v_bfe_u32 v27, v26, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v27, v27, v26
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_add_u32_e32 v27, 0x7fff, v27
+; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc
+; GFX9-NEXT: v_bfe_u32 v27, v10, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v27, v27, v10
+; GFX9-NEXT: v_add_u32_e32 v27, 0x7fff, v27
+; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v10
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc
+; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v11
+; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; GFX9-NEXT: v_bfe_u32 v28, v27, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v28, v28, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_add_u32_e32 v28, 0x7fff, v28
+; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc
+; GFX9-NEXT: v_bfe_u32 v28, v11, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v28, v28, v11
+; GFX9-NEXT: v_add_u32_e32 v28, 0x7fff, v28
+; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v11
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc
+; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v12
+; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; GFX9-NEXT: v_bfe_u32 v29, v28, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v29, v29, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_add_u32_e32 v29, 0x7fff, v29
+; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v28
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc
+; GFX9-NEXT: v_bfe_u32 v29, v12, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v29, v29, v12
+; GFX9-NEXT: v_add_u32_e32 v29, 0x7fff, v29
+; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v12
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v30, vcc
+; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v13
+; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; GFX9-NEXT: v_bfe_u32 v30, v29, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v30, v30, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_add_u32_e32 v30, 0x7fff, v30
+; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v29
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v29, v30, v31, vcc
+; GFX9-NEXT: v_bfe_u32 v30, v13, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v30, v30, v13
+; GFX9-NEXT: v_add_u32_e32 v30, 0x7fff, v30
+; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc
+; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v14
+; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; GFX9-NEXT: v_bfe_u32 v31, v30, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v31, v31, v30
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_add_u32_e32 v31, 0x7fff, v31
+; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v30
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc
+; GFX9-NEXT: v_bfe_u32 v31, v14, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v31, v31, v14
+; GFX9-NEXT: v_add_u32_e32 v31, 0x7fff, v31
+; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc
+; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
+; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v32, v32, v31
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_add_u32_e32 v32, 0x7fff, v32
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; GFX9-NEXT: v_bfe_u32 v32, v15, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v32, v32, v15
+; GFX9-NEXT: v_add_u32_e32 v32, 0x7fff, v32
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_mov_b32_e32 v32, 0xffff0000
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_and_or_b32 v15, v31, v32, v15
+; GFX9-NEXT: v_and_or_b32 v14, v30, v32, v14
+; GFX9-NEXT: v_and_or_b32 v13, v29, v32, v13
+; GFX9-NEXT: v_and_or_b32 v12, v28, v32, v12
+; GFX9-NEXT: v_and_or_b32 v11, v27, v32, v11
+; GFX9-NEXT: v_and_or_b32 v10, v26, v32, v10
+; GFX9-NEXT: v_and_or_b32 v9, v25, v32, v9
+; GFX9-NEXT: v_and_or_b32 v8, v24, v32, v8
+; GFX9-NEXT: v_and_or_b32 v7, v23, v32, v7
+; GFX9-NEXT: v_and_or_b32 v6, v22, v32, v6
+; GFX9-NEXT: v_and_or_b32 v5, v21, v32, v5
+; GFX9-NEXT: v_and_or_b32 v4, v20, v32, v4
+; GFX9-NEXT: v_and_or_b32 v3, v19, v32, v3
+; GFX9-NEXT: v_and_or_b32 v2, v18, v32, v2
+; GFX9-NEXT: v_and_or_b32 v1, v17, v32, v1
+; GFX9-NEXT: v_and_or_b32 v0, v16, v32, v0
+; GFX9-NEXT: .LBB95_3: ; %end
; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: .LBB95_4:
+; GFX9-NEXT: s_branch .LBB95_2
;
; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v32i16_scalar:
; GFX11-TRUE16: ; %bb.0:
@@ -64301,6 +65004,18 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v1, 0xff, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v62
+; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -64317,18 +65032,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -64780,6 +65483,8 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -64796,8 +65501,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -65069,25 +65772,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -65114,6 +65801,22 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -66382,32 +67085,60 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v4, s30, 0
-; VI-NEXT: v_writelane_b32 v4, s31, 1
-; VI-NEXT: v_writelane_b32 v4, s34, 2
-; VI-NEXT: v_writelane_b32 v4, s35, 3
-; VI-NEXT: v_writelane_b32 v4, s36, 4
-; VI-NEXT: v_writelane_b32 v4, s37, 5
-; VI-NEXT: v_writelane_b32 v4, s38, 6
-; VI-NEXT: v_writelane_b32 v4, s39, 7
-; VI-NEXT: v_writelane_b32 v4, s48, 8
-; VI-NEXT: v_writelane_b32 v4, s49, 9
-; VI-NEXT: v_writelane_b32 v4, s50, 10
-; VI-NEXT: v_writelane_b32 v4, s51, 11
-; VI-NEXT: v_writelane_b32 v4, s52, 12
-; VI-NEXT: v_writelane_b32 v4, s53, 13
-; VI-NEXT: v_writelane_b32 v4, s54, 14
-; VI-NEXT: v_writelane_b32 v4, s55, 15
-; VI-NEXT: v_writelane_b32 v4, s64, 16
-; VI-NEXT: v_writelane_b32 v4, s65, 17
+; VI-NEXT: v_writelane_b32 v18, s30, 0
+; VI-NEXT: v_writelane_b32 v18, s31, 1
+; VI-NEXT: v_writelane_b32 v18, s34, 2
+; VI-NEXT: v_writelane_b32 v18, s35, 3
+; VI-NEXT: v_writelane_b32 v18, s36, 4
+; VI-NEXT: v_writelane_b32 v18, s37, 5
+; VI-NEXT: v_writelane_b32 v18, s38, 6
+; VI-NEXT: v_writelane_b32 v18, s39, 7
+; VI-NEXT: v_writelane_b32 v18, s48, 8
+; VI-NEXT: v_writelane_b32 v18, s49, 9
+; VI-NEXT: v_writelane_b32 v18, s50, 10
+; VI-NEXT: v_writelane_b32 v18, s51, 11
+; VI-NEXT: v_writelane_b32 v18, s52, 12
+; VI-NEXT: v_writelane_b32 v18, s53, 13
+; VI-NEXT: v_writelane_b32 v18, s54, 14
+; VI-NEXT: v_writelane_b32 v18, s55, 15
+; VI-NEXT: v_writelane_b32 v18, s64, 16
+; VI-NEXT: v_writelane_b32 v18, s65, 17
+; VI-NEXT: v_mov_b32_e32 v4, s16
+; VI-NEXT: v_mov_b32_e32 v5, s17
+; VI-NEXT: v_mov_b32_e32 v6, s18
+; VI-NEXT: v_mov_b32_e32 v7, s19
+; VI-NEXT: v_mov_b32_e32 v8, s20
+; VI-NEXT: v_mov_b32_e32 v9, s21
+; VI-NEXT: v_mov_b32_e32 v10, s22
+; VI-NEXT: v_mov_b32_e32 v11, s23
+; VI-NEXT: v_mov_b32_e32 v12, s24
+; VI-NEXT: v_mov_b32_e32 v13, s25
+; VI-NEXT: v_mov_b32_e32 v14, s26
+; VI-NEXT: v_mov_b32_e32 v15, s27
+; VI-NEXT: v_mov_b32_e32 v16, s28
+; VI-NEXT: v_mov_b32_e32 v17, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; VI-NEXT: v_writelane_b32 v4, s66, 18
+; VI-NEXT: v_writelane_b32 v18, s66, 18
+; VI-NEXT: v_readfirstlane_b32 s18, v4
+; VI-NEXT: v_readfirstlane_b32 s19, v5
+; VI-NEXT: v_readfirstlane_b32 s16, v6
+; VI-NEXT: v_readfirstlane_b32 s17, v7
+; VI-NEXT: v_readfirstlane_b32 s14, v8
+; VI-NEXT: v_readfirstlane_b32 s15, v9
+; VI-NEXT: v_readfirstlane_b32 s12, v10
+; VI-NEXT: v_readfirstlane_b32 s13, v11
+; VI-NEXT: v_readfirstlane_b32 s10, v12
+; VI-NEXT: v_readfirstlane_b32 s11, v13
+; VI-NEXT: v_readfirstlane_b32 s8, v14
+; VI-NEXT: v_readfirstlane_b32 s9, v15
+; VI-NEXT: v_readfirstlane_b32 s6, v16
+; VI-NEXT: v_readfirstlane_b32 s7, v17
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_and_b64 s[20:21], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s5, v2
-; VI-NEXT: v_writelane_b32 v4, s67, 19
+; VI-NEXT: v_writelane_b32 v18, s67, 19
; VI-NEXT: s_cbranch_scc0 .LBB97_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s56, s5, 24
@@ -66415,351 +67146,351 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; VI-NEXT: s_lshr_b32 s58, s5, 8
; VI-NEXT: s_lshr_b32 s59, s4, 16
; VI-NEXT: s_lshr_b32 s60, s4, 8
-; VI-NEXT: s_lshr_b32 s61, s29, 24
-; VI-NEXT: s_lshr_b32 s62, s29, 16
-; VI-NEXT: s_lshr_b32 s63, s29, 8
-; VI-NEXT: s_lshr_b32 s72, s28, 16
-; VI-NEXT: s_lshr_b32 s73, s28, 8
-; VI-NEXT: s_lshr_b32 s74, s27, 24
-; VI-NEXT: s_lshr_b32 s75, s27, 16
-; VI-NEXT: s_lshr_b32 s76, s27, 8
-; VI-NEXT: s_lshr_b32 s77, s26, 16
-; VI-NEXT: s_lshr_b32 s78, s26, 8
-; VI-NEXT: s_lshr_b32 s79, s25, 24
-; VI-NEXT: s_lshr_b32 s88, s25, 16
-; VI-NEXT: s_lshr_b32 s89, s25, 8
-; VI-NEXT: s_lshr_b32 s90, s24, 16
-; VI-NEXT: s_lshr_b32 s91, s24, 8
-; VI-NEXT: s_lshr_b32 s30, s23, 24
-; VI-NEXT: s_lshr_b32 s31, s23, 16
-; VI-NEXT: s_lshr_b32 s34, s23, 8
-; VI-NEXT: s_lshr_b32 s35, s22, 16
-; VI-NEXT: s_lshr_b32 s36, s22, 8
-; VI-NEXT: s_lshr_b32 s37, s21, 24
-; VI-NEXT: s_lshr_b32 s38, s21, 16
-; VI-NEXT: s_lshr_b32 s39, s21, 8
-; VI-NEXT: s_lshr_b32 s48, s20, 16
-; VI-NEXT: s_lshr_b32 s49, s20, 8
-; VI-NEXT: s_lshr_b32 s50, s19, 24
-; VI-NEXT: s_lshr_b32 s51, s19, 16
-; VI-NEXT: s_lshr_b32 s52, s19, 8
-; VI-NEXT: s_lshr_b32 s53, s18, 16
-; VI-NEXT: s_lshr_b32 s54, s18, 8
-; VI-NEXT: s_lshr_b32 s55, s17, 24
-; VI-NEXT: s_lshr_b32 s64, s17, 16
-; VI-NEXT: s_lshr_b32 s65, s17, 8
-; VI-NEXT: s_lshr_b32 s66, s16, 16
-; VI-NEXT: s_lshr_b32 s67, s16, 8
-; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24
-; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24
-; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
-; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24
-; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24
-; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24
-; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24
+; VI-NEXT: s_lshr_b32 s61, s7, 24
+; VI-NEXT: s_lshr_b32 s62, s7, 16
+; VI-NEXT: s_lshr_b32 s63, s7, 8
+; VI-NEXT: s_lshr_b32 s72, s6, 16
+; VI-NEXT: s_lshr_b32 s73, s6, 8
+; VI-NEXT: s_lshr_b32 s74, s9, 24
+; VI-NEXT: s_lshr_b32 s75, s9, 16
+; VI-NEXT: s_lshr_b32 s76, s9, 8
+; VI-NEXT: s_lshr_b32 s77, s8, 16
+; VI-NEXT: s_lshr_b32 s78, s8, 8
+; VI-NEXT: s_lshr_b32 s79, s11, 24
+; VI-NEXT: s_lshr_b32 s88, s11, 16
+; VI-NEXT: s_lshr_b32 s89, s11, 8
+; VI-NEXT: s_lshr_b32 s90, s10, 16
+; VI-NEXT: s_lshr_b32 s91, s10, 8
+; VI-NEXT: s_lshr_b32 s30, s13, 24
+; VI-NEXT: s_lshr_b32 s31, s13, 16
+; VI-NEXT: s_lshr_b32 s34, s13, 8
+; VI-NEXT: s_lshr_b32 s35, s12, 16
+; VI-NEXT: s_lshr_b32 s36, s12, 8
+; VI-NEXT: s_lshr_b32 s37, s15, 24
+; VI-NEXT: s_lshr_b32 s38, s15, 16
+; VI-NEXT: s_lshr_b32 s39, s15, 8
+; VI-NEXT: s_lshr_b32 s48, s14, 16
+; VI-NEXT: s_lshr_b32 s49, s14, 8
+; VI-NEXT: s_lshr_b32 s50, s17, 24
+; VI-NEXT: s_lshr_b32 s51, s17, 16
+; VI-NEXT: s_lshr_b32 s52, s17, 8
+; VI-NEXT: s_lshr_b32 s53, s16, 16
+; VI-NEXT: s_lshr_b32 s54, s16, 8
+; VI-NEXT: s_lshr_b32 s55, s19, 24
+; VI-NEXT: s_lshr_b32 s64, s19, 16
+; VI-NEXT: s_lshr_b32 s65, s19, 8
+; VI-NEXT: s_lshr_b32 s66, s18, 16
+; VI-NEXT: s_lshr_b32 s67, s18, 8
+; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24
+; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24
+; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24
+; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24
+; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24
+; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24
+; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24
; VI-NEXT: s_cbranch_execnz .LBB97_3
; VI-NEXT: .LBB97_2: ; %cmp.true
-; VI-NEXT: s_add_i32 s7, s17, 3
-; VI-NEXT: s_and_b32 s6, s17, 0xffff0000
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_or_b32 s6, s6, s7
-; VI-NEXT: s_add_i32 s7, s16, 3
-; VI-NEXT: s_add_i32 s17, s6, 0x30000
-; VI-NEXT: s_and_b32 s6, s16, 0xffff0000
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_or_b32 s6, s6, s7
-; VI-NEXT: s_add_i32 s7, s19, 3
-; VI-NEXT: s_add_i32 s16, s6, 0x30000
-; VI-NEXT: s_and_b32 s6, s19, 0xffff0000
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_or_b32 s6, s6, s7
-; VI-NEXT: s_add_i32 s7, s18, 3
-; VI-NEXT: s_add_i32 s19, s6, 0x30000
-; VI-NEXT: s_and_b32 s6, s18, 0xffff0000
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_or_b32 s6, s6, s7
-; VI-NEXT: s_add_i32 s7, s21, 3
-; VI-NEXT: s_add_i32 s18, s6, 0x30000
-; VI-NEXT: s_and_b32 s6, s21, 0xffff0000
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_or_b32 s6, s6, s7
-; VI-NEXT: s_add_i32 s7, s20, 3
-; VI-NEXT: s_add_i32 s21, s6, 0x30000
-; VI-NEXT: s_and_b32 s6, s20, 0xffff0000
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_or_b32 s6, s6, s7
-; VI-NEXT: s_add_i32 s7, s23, 3
-; VI-NEXT: s_add_i32 s20, s6, 0x30000
-; VI-NEXT: s_and_b32 s6, s23, 0xffff0000
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_or_b32 s6, s6, s7
-; VI-NEXT: s_add_i32 s7, s22, 3
-; VI-NEXT: s_add_i32 s23, s6, 0x30000
-; VI-NEXT: s_and_b32 s6, s22, 0xffff0000
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_or_b32 s6, s6, s7
-; VI-NEXT: s_add_i32 s7, s25, 3
-; VI-NEXT: s_add_i32 s22, s6, 0x30000
-; VI-NEXT: s_and_b32 s6, s25, 0xffff0000
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_or_b32 s6, s6, s7
-; VI-NEXT: s_add_i32 s7, s24, 3
-; VI-NEXT: s_add_i32 s25, s6, 0x30000
-; VI-NEXT: s_and_b32 s6, s24, 0xffff0000
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_or_b32 s6, s6, s7
-; VI-NEXT: s_add_i32 s7, s27, 3
-; VI-NEXT: s_add_i32 s24, s6, 0x30000
-; VI-NEXT: s_and_b32 s6, s27, 0xffff0000
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_or_b32 s6, s6, s7
-; VI-NEXT: s_add_i32 s7, s26, 3
-; VI-NEXT: s_add_i32 s27, s6, 0x30000
-; VI-NEXT: s_and_b32 s6, s26, 0xffff0000
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_or_b32 s6, s6, s7
-; VI-NEXT: s_add_i32 s7, s29, 3
-; VI-NEXT: s_add_i32 s26, s6, 0x30000
-; VI-NEXT: s_and_b32 s6, s29, 0xffff0000
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_or_b32 s6, s6, s7
-; VI-NEXT: s_add_i32 s7, s28, 3
-; VI-NEXT: s_add_i32 s29, s6, 0x30000
-; VI-NEXT: s_and_b32 s6, s28, 0xffff0000
+; VI-NEXT: s_and_b32 s20, s19, 0xffff0000
+; VI-NEXT: s_add_i32 s19, s19, 3
+; VI-NEXT: s_and_b32 s19, s19, 0xffff
+; VI-NEXT: s_or_b32 s19, s20, s19
+; VI-NEXT: s_and_b32 s20, s18, 0xffff0000
+; VI-NEXT: s_add_i32 s18, s18, 3
+; VI-NEXT: s_and_b32 s18, s18, 0xffff
+; VI-NEXT: s_or_b32 s18, s20, s18
+; VI-NEXT: s_and_b32 s20, s17, 0xffff0000
+; VI-NEXT: s_add_i32 s17, s17, 3
+; VI-NEXT: s_and_b32 s17, s17, 0xffff
+; VI-NEXT: s_or_b32 s17, s20, s17
+; VI-NEXT: s_and_b32 s20, s16, 0xffff0000
+; VI-NEXT: s_add_i32 s16, s16, 3
+; VI-NEXT: s_and_b32 s16, s16, 0xffff
+; VI-NEXT: s_or_b32 s16, s20, s16
+; VI-NEXT: s_and_b32 s20, s15, 0xffff0000
+; VI-NEXT: s_add_i32 s15, s15, 3
+; VI-NEXT: s_and_b32 s15, s15, 0xffff
+; VI-NEXT: s_or_b32 s15, s20, s15
+; VI-NEXT: s_and_b32 s20, s14, 0xffff0000
+; VI-NEXT: s_add_i32 s14, s14, 3
+; VI-NEXT: s_and_b32 s14, s14, 0xffff
+; VI-NEXT: s_or_b32 s14, s20, s14
+; VI-NEXT: s_and_b32 s20, s13, 0xffff0000
+; VI-NEXT: s_add_i32 s13, s13, 3
+; VI-NEXT: s_and_b32 s13, s13, 0xffff
+; VI-NEXT: s_or_b32 s13, s20, s13
+; VI-NEXT: s_and_b32 s20, s12, 0xffff0000
+; VI-NEXT: s_add_i32 s12, s12, 3
+; VI-NEXT: s_and_b32 s12, s12, 0xffff
+; VI-NEXT: s_or_b32 s12, s20, s12
+; VI-NEXT: s_and_b32 s20, s11, 0xffff0000
+; VI-NEXT: s_add_i32 s11, s11, 3
+; VI-NEXT: s_and_b32 s11, s11, 0xffff
+; VI-NEXT: s_or_b32 s11, s20, s11
+; VI-NEXT: s_and_b32 s20, s10, 0xffff0000
+; VI-NEXT: s_add_i32 s10, s10, 3
+; VI-NEXT: s_and_b32 s10, s10, 0xffff
+; VI-NEXT: s_or_b32 s10, s20, s10
+; VI-NEXT: s_and_b32 s20, s9, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s9, 3
+; VI-NEXT: s_and_b32 s9, s9, 0xffff
+; VI-NEXT: s_or_b32 s9, s20, s9
+; VI-NEXT: s_and_b32 s20, s8, 0xffff0000
+; VI-NEXT: s_add_i32 s8, s8, 3
+; VI-NEXT: s_and_b32 s8, s8, 0xffff
+; VI-NEXT: s_or_b32 s8, s20, s8
+; VI-NEXT: s_and_b32 s20, s7, 0xffff0000
+; VI-NEXT: s_add_i32 s7, s7, 3
; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_or_b32 s6, s6, s7
-; VI-NEXT: s_add_i32 s28, s6, 0x30000
-; VI-NEXT: s_and_b32 s6, s5, 0xffff0000
+; VI-NEXT: s_or_b32 s7, s20, s7
+; VI-NEXT: s_and_b32 s20, s6, 0xffff0000
+; VI-NEXT: s_add_i32 s6, s6, 3
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_or_b32 s6, s20, s6
+; VI-NEXT: s_and_b32 s20, s5, 0xffff0000
; VI-NEXT: s_add_i32 s5, s5, 3
; VI-NEXT: s_and_b32 s5, s5, 0xffff
-; VI-NEXT: s_or_b32 s5, s6, s5
-; VI-NEXT: s_and_b32 s6, s4, 0xffff0000
+; VI-NEXT: s_or_b32 s5, s20, s5
+; VI-NEXT: s_and_b32 s20, s4, 0xffff0000
; VI-NEXT: s_add_i32 s4, s4, 3
; VI-NEXT: s_and_b32 s4, s4, 0xffff
-; VI-NEXT: s_or_b32 s4, s6, s4
+; VI-NEXT: s_or_b32 s4, s20, s4
+; VI-NEXT: s_add_i32 s19, s19, 0x30000
+; VI-NEXT: s_add_i32 s18, s18, 0x30000
+; VI-NEXT: s_add_i32 s17, s17, 0x30000
+; VI-NEXT: s_add_i32 s16, s16, 0x30000
+; VI-NEXT: s_add_i32 s15, s15, 0x30000
+; VI-NEXT: s_add_i32 s14, s14, 0x30000
+; VI-NEXT: s_add_i32 s13, s13, 0x30000
+; VI-NEXT: s_add_i32 s12, s12, 0x30000
+; VI-NEXT: s_add_i32 s11, s11, 0x30000
+; VI-NEXT: s_add_i32 s10, s10, 0x30000
+; VI-NEXT: s_add_i32 s9, s9, 0x30000
+; VI-NEXT: s_add_i32 s8, s8, 0x30000
+; VI-NEXT: s_add_i32 s7, s7, 0x30000
+; VI-NEXT: s_add_i32 s6, s6, 0x30000
; VI-NEXT: s_add_i32 s5, s5, 0x30000
; VI-NEXT: s_add_i32 s4, s4, 0x30000
-; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24
-; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24
-; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
-; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24
-; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24
-; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24
-; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24
+; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24
+; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24
+; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24
+; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24
+; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24
+; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24
; VI-NEXT: s_lshr_b32 s56, s5, 24
; VI-NEXT: s_lshr_b32 s57, s5, 16
; VI-NEXT: s_lshr_b32 s58, s5, 8
; VI-NEXT: s_lshr_b32 s59, s4, 16
; VI-NEXT: s_lshr_b32 s60, s4, 8
-; VI-NEXT: s_lshr_b32 s61, s29, 24
-; VI-NEXT: s_lshr_b32 s62, s29, 16
-; VI-NEXT: s_lshr_b32 s63, s29, 8
-; VI-NEXT: s_lshr_b32 s72, s28, 16
-; VI-NEXT: s_lshr_b32 s73, s28, 8
-; VI-NEXT: s_lshr_b32 s74, s27, 24
-; VI-NEXT: s_lshr_b32 s75, s27, 16
-; VI-NEXT: s_lshr_b32 s76, s27, 8
-; VI-NEXT: s_lshr_b32 s77, s26, 16
-; VI-NEXT: s_lshr_b32 s78, s26, 8
-; VI-NEXT: s_lshr_b32 s79, s25, 24
-; VI-NEXT: s_lshr_b32 s88, s25, 16
-; VI-NEXT: s_lshr_b32 s89, s25, 8
-; VI-NEXT: s_lshr_b32 s90, s24, 16
-; VI-NEXT: s_lshr_b32 s91, s24, 8
-; VI-NEXT: s_lshr_b32 s30, s23, 24
-; VI-NEXT: s_lshr_b32 s31, s23, 16
-; VI-NEXT: s_lshr_b32 s34, s23, 8
-; VI-NEXT: s_lshr_b32 s35, s22, 16
-; VI-NEXT: s_lshr_b32 s36, s22, 8
-; VI-NEXT: s_lshr_b32 s37, s21, 24
-; VI-NEXT: s_lshr_b32 s38, s21, 16
-; VI-NEXT: s_lshr_b32 s39, s21, 8
-; VI-NEXT: s_lshr_b32 s48, s20, 16
-; VI-NEXT: s_lshr_b32 s49, s20, 8
-; VI-NEXT: s_lshr_b32 s50, s19, 24
-; VI-NEXT: s_lshr_b32 s51, s19, 16
-; VI-NEXT: s_lshr_b32 s52, s19, 8
-; VI-NEXT: s_lshr_b32 s53, s18, 16
-; VI-NEXT: s_lshr_b32 s54, s18, 8
-; VI-NEXT: s_lshr_b32 s55, s17, 24
-; VI-NEXT: s_lshr_b32 s64, s17, 16
-; VI-NEXT: s_lshr_b32 s65, s17, 8
-; VI-NEXT: s_lshr_b32 s66, s16, 16
-; VI-NEXT: s_lshr_b32 s67, s16, 8
+; VI-NEXT: s_lshr_b32 s61, s7, 24
+; VI-NEXT: s_lshr_b32 s62, s7, 16
+; VI-NEXT: s_lshr_b32 s63, s7, 8
+; VI-NEXT: s_lshr_b32 s72, s6, 16
+; VI-NEXT: s_lshr_b32 s73, s6, 8
+; VI-NEXT: s_lshr_b32 s74, s9, 24
+; VI-NEXT: s_lshr_b32 s75, s9, 16
+; VI-NEXT: s_lshr_b32 s76, s9, 8
+; VI-NEXT: s_lshr_b32 s77, s8, 16
+; VI-NEXT: s_lshr_b32 s78, s8, 8
+; VI-NEXT: s_lshr_b32 s79, s11, 24
+; VI-NEXT: s_lshr_b32 s88, s11, 16
+; VI-NEXT: s_lshr_b32 s89, s11, 8
+; VI-NEXT: s_lshr_b32 s90, s10, 16
+; VI-NEXT: s_lshr_b32 s91, s10, 8
+; VI-NEXT: s_lshr_b32 s30, s13, 24
+; VI-NEXT: s_lshr_b32 s31, s13, 16
+; VI-NEXT: s_lshr_b32 s34, s13, 8
+; VI-NEXT: s_lshr_b32 s35, s12, 16
+; VI-NEXT: s_lshr_b32 s36, s12, 8
+; VI-NEXT: s_lshr_b32 s37, s15, 24
+; VI-NEXT: s_lshr_b32 s38, s15, 16
+; VI-NEXT: s_lshr_b32 s39, s15, 8
+; VI-NEXT: s_lshr_b32 s48, s14, 16
+; VI-NEXT: s_lshr_b32 s49, s14, 8
+; VI-NEXT: s_lshr_b32 s50, s17, 24
+; VI-NEXT: s_lshr_b32 s51, s17, 16
+; VI-NEXT: s_lshr_b32 s52, s17, 8
+; VI-NEXT: s_lshr_b32 s53, s16, 16
+; VI-NEXT: s_lshr_b32 s54, s16, 8
+; VI-NEXT: s_lshr_b32 s55, s19, 24
+; VI-NEXT: s_lshr_b32 s64, s19, 16
+; VI-NEXT: s_lshr_b32 s65, s19, 8
+; VI-NEXT: s_lshr_b32 s66, s18, 16
+; VI-NEXT: s_lshr_b32 s67, s18, 8
; VI-NEXT: .LBB97_3: ; %end
-; VI-NEXT: s_and_b32 s7, s16, 0xff
-; VI-NEXT: s_lshl_b32 s9, s67, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s66, 0xff
-; VI-NEXT: s_lshl_b32 s11, s44, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: s_and_b32 s7, s17, 0xff
-; VI-NEXT: s_lshl_b32 s9, s65, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s64, 0xff
-; VI-NEXT: s_lshl_b32 s11, s55, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s18, 0xff
-; VI-NEXT: s_lshl_b32 s9, s54, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s53, 0xff
-; VI-NEXT: s_lshl_b32 s11, s42, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: s_and_b32 s18, s18, 0xff
+; VI-NEXT: s_lshl_b32 s21, s67, 8
+; VI-NEXT: s_or_b32 s18, s18, s21
+; VI-NEXT: s_and_b32 s21, s66, 0xff
+; VI-NEXT: s_lshl_b32 s23, s44, 8
+; VI-NEXT: s_or_b32 s21, s21, s23
+; VI-NEXT: s_and_b32 s18, s18, 0xffff
+; VI-NEXT: s_lshl_b32 s21, s21, 16
+; VI-NEXT: s_or_b32 s18, s18, s21
+; VI-NEXT: v_mov_b32_e32 v1, s18
+; VI-NEXT: s_and_b32 s18, s19, 0xff
+; VI-NEXT: s_lshl_b32 s19, s65, 8
+; VI-NEXT: s_or_b32 s18, s18, s19
+; VI-NEXT: s_and_b32 s19, s64, 0xff
+; VI-NEXT: s_lshl_b32 s21, s55, 8
+; VI-NEXT: s_or_b32 s19, s19, s21
+; VI-NEXT: s_and_b32 s18, s18, 0xffff
+; VI-NEXT: s_lshl_b32 s19, s19, 16
+; VI-NEXT: s_or_b32 s18, s18, s19
+; VI-NEXT: v_mov_b32_e32 v2, s18
+; VI-NEXT: s_and_b32 s16, s16, 0xff
+; VI-NEXT: s_lshl_b32 s18, s54, 8
+; VI-NEXT: s_or_b32 s16, s16, s18
+; VI-NEXT: s_and_b32 s18, s53, 0xff
+; VI-NEXT: s_lshl_b32 s19, s42, 8
+; VI-NEXT: s_or_b32 s18, s18, s19
+; VI-NEXT: s_and_b32 s16, s16, 0xffff
+; VI-NEXT: s_lshl_b32 s18, s18, 16
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s16, s16, s18
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s19, 0xff
-; VI-NEXT: s_lshl_b32 s9, s52, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s51, 0xff
-; VI-NEXT: s_lshl_b32 s11, s50, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s16
+; VI-NEXT: s_and_b32 s16, s17, 0xff
+; VI-NEXT: s_lshl_b32 s17, s52, 8
+; VI-NEXT: s_or_b32 s16, s16, s17
+; VI-NEXT: s_and_b32 s17, s51, 0xff
+; VI-NEXT: s_lshl_b32 s18, s50, 8
+; VI-NEXT: s_or_b32 s17, s17, s18
+; VI-NEXT: s_and_b32 s16, s16, 0xffff
+; VI-NEXT: s_lshl_b32 s17, s17, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s16, s16, s17
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s20, 0xff
-; VI-NEXT: s_lshl_b32 s9, s49, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s48, 0xff
-; VI-NEXT: s_lshl_b32 s11, s40, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s16
+; VI-NEXT: s_and_b32 s14, s14, 0xff
+; VI-NEXT: s_lshl_b32 s16, s49, 8
+; VI-NEXT: s_or_b32 s14, s14, s16
+; VI-NEXT: s_and_b32 s16, s48, 0xff
+; VI-NEXT: s_lshl_b32 s17, s40, 8
+; VI-NEXT: s_or_b32 s16, s16, s17
+; VI-NEXT: s_and_b32 s14, s14, 0xffff
+; VI-NEXT: s_lshl_b32 s16, s16, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s14, s14, s16
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s21, 0xff
-; VI-NEXT: s_lshl_b32 s9, s39, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s38, 0xff
-; VI-NEXT: s_lshl_b32 s11, s37, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s14
+; VI-NEXT: s_and_b32 s14, s15, 0xff
+; VI-NEXT: s_lshl_b32 s15, s39, 8
+; VI-NEXT: s_or_b32 s14, s14, s15
+; VI-NEXT: s_and_b32 s15, s38, 0xff
+; VI-NEXT: s_lshl_b32 s16, s37, 8
+; VI-NEXT: s_or_b32 s15, s15, s16
+; VI-NEXT: s_and_b32 s14, s14, 0xffff
+; VI-NEXT: s_lshl_b32 s15, s15, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s14, s14, s15
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s22, 0xff
-; VI-NEXT: s_lshl_b32 s9, s36, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s35, 0xff
-; VI-NEXT: s_lshl_b32 s11, s14, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s14
+; VI-NEXT: s_and_b32 s12, s12, 0xff
+; VI-NEXT: s_lshl_b32 s14, s36, 8
+; VI-NEXT: s_or_b32 s12, s12, s14
+; VI-NEXT: s_and_b32 s14, s35, 0xff
+; VI-NEXT: s_lshl_b32 s15, s28, 8
+; VI-NEXT: s_or_b32 s14, s14, s15
+; VI-NEXT: s_and_b32 s12, s12, 0xffff
+; VI-NEXT: s_lshl_b32 s14, s14, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s12, s12, s14
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s23, 0xff
-; VI-NEXT: s_lshl_b32 s9, s34, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s31, 0xff
-; VI-NEXT: s_lshl_b32 s11, s30, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s12
+; VI-NEXT: s_and_b32 s12, s13, 0xff
+; VI-NEXT: s_lshl_b32 s13, s34, 8
+; VI-NEXT: s_or_b32 s12, s12, s13
+; VI-NEXT: s_and_b32 s13, s31, 0xff
+; VI-NEXT: s_lshl_b32 s14, s30, 8
+; VI-NEXT: s_or_b32 s13, s13, s14
+; VI-NEXT: s_and_b32 s12, s12, 0xffff
+; VI-NEXT: s_lshl_b32 s13, s13, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s12, s12, s13
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s24, 0xff
-; VI-NEXT: s_lshl_b32 s9, s91, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s90, 0xff
-; VI-NEXT: s_lshl_b32 s11, s12, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s12
+; VI-NEXT: s_and_b32 s10, s10, 0xff
+; VI-NEXT: s_lshl_b32 s12, s91, 8
+; VI-NEXT: s_or_b32 s10, s10, s12
+; VI-NEXT: s_and_b32 s12, s90, 0xff
+; VI-NEXT: s_lshl_b32 s13, s26, 8
+; VI-NEXT: s_or_b32 s12, s12, s13
+; VI-NEXT: s_and_b32 s10, s10, 0xffff
+; VI-NEXT: s_lshl_b32 s12, s12, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s10, s10, s12
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s25, 0xff
-; VI-NEXT: s_lshl_b32 s9, s89, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s88, 0xff
-; VI-NEXT: s_lshl_b32 s11, s79, 8
-; VI-NEXT: s_or_b32 s9, s9, s11
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: s_and_b32 s10, s11, 0xff
+; VI-NEXT: s_lshl_b32 s11, s89, 8
+; VI-NEXT: s_or_b32 s10, s10, s11
+; VI-NEXT: s_and_b32 s11, s88, 0xff
+; VI-NEXT: s_lshl_b32 s12, s79, 8
+; VI-NEXT: s_or_b32 s11, s11, s12
+; VI-NEXT: s_and_b32 s10, s10, 0xffff
+; VI-NEXT: s_lshl_b32 s11, s11, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s10, s10, s11
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s26, 0xff
-; VI-NEXT: s_lshl_b32 s9, s78, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s77, 0xff
-; VI-NEXT: s_lshl_b32 s10, s10, 8
-; VI-NEXT: s_or_b32 s9, s9, s10
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: s_and_b32 s8, s8, 0xff
+; VI-NEXT: s_lshl_b32 s10, s78, 8
+; VI-NEXT: s_or_b32 s8, s8, s10
+; VI-NEXT: s_and_b32 s10, s77, 0xff
+; VI-NEXT: s_lshl_b32 s11, s24, 8
+; VI-NEXT: s_or_b32 s10, s10, s11
+; VI-NEXT: s_and_b32 s8, s8, 0xffff
+; VI-NEXT: s_lshl_b32 s10, s10, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s8, s8, s10
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s27, 0xff
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: s_and_b32 s8, s9, 0xff
; VI-NEXT: s_lshl_b32 s9, s76, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s8, s8, s9
; VI-NEXT: s_and_b32 s9, s75, 0xff
; VI-NEXT: s_lshl_b32 s10, s74, 8
; VI-NEXT: s_or_b32 s9, s9, s10
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_and_b32 s8, s8, 0xffff
; VI-NEXT: s_lshl_b32 s9, s9, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0
-; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_or_b32 s8, s8, s9
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s28, 0xff
-; VI-NEXT: s_lshl_b32 s9, s73, 8
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_and_b32 s9, s72, 0xff
-; VI-NEXT: s_lshl_b32 s8, s8, 8
-; VI-NEXT: s_or_b32 s8, s9, s8
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xff
+; VI-NEXT: s_lshl_b32 s8, s73, 8
+; VI-NEXT: s_or_b32 s6, s6, s8
+; VI-NEXT: s_and_b32 s8, s72, 0xff
+; VI-NEXT: s_lshl_b32 s9, s22, 8
+; VI-NEXT: s_or_b32 s8, s8, s9
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
; VI-NEXT: s_lshl_b32 s8, s8, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0
-; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_or_b32 s6, s6, s8
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_and_b32 s7, s29, 0xff
-; VI-NEXT: s_lshl_b32 s8, s63, 8
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: s_and_b32 s6, s7, 0xff
+; VI-NEXT: s_lshl_b32 s7, s63, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s62, 0xff
+; VI-NEXT: s_lshl_b32 s8, s61, 8
; VI-NEXT: s_or_b32 s7, s7, s8
-; VI-NEXT: s_and_b32 s8, s62, 0xff
-; VI-NEXT: s_lshl_b32 s9, s61, 8
-; VI-NEXT: s_or_b32 s8, s8, s9
-; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s8, s8, 16
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0
-; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_or_b32 s6, s6, s7
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: s_and_b32 s4, s4, 0xff
-; VI-NEXT: s_lshl_b32 s7, s60, 8
-; VI-NEXT: s_or_b32 s4, s4, s7
-; VI-NEXT: s_and_b32 s7, s59, 0xff
-; VI-NEXT: s_lshl_b32 s6, s6, 8
-; VI-NEXT: s_or_b32 s6, s7, s6
+; VI-NEXT: s_lshl_b32 s6, s60, 8
+; VI-NEXT: s_or_b32 s4, s4, s6
+; VI-NEXT: s_and_b32 s6, s59, 0xff
+; VI-NEXT: s_lshl_b32 s7, s20, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
; VI-NEXT: s_and_b32 s4, s4, 0xffff
; VI-NEXT: s_lshl_b32 s6, s6, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0
@@ -66780,28 +67511,28 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_readlane_b32 s67, v4, 19
-; VI-NEXT: v_readlane_b32 s66, v4, 18
-; VI-NEXT: v_readlane_b32 s65, v4, 17
-; VI-NEXT: v_readlane_b32 s64, v4, 16
-; VI-NEXT: v_readlane_b32 s55, v4, 15
-; VI-NEXT: v_readlane_b32 s54, v4, 14
-; VI-NEXT: v_readlane_b32 s53, v4, 13
-; VI-NEXT: v_readlane_b32 s52, v4, 12
-; VI-NEXT: v_readlane_b32 s51, v4, 11
-; VI-NEXT: v_readlane_b32 s50, v4, 10
-; VI-NEXT: v_readlane_b32 s49, v4, 9
-; VI-NEXT: v_readlane_b32 s48, v4, 8
-; VI-NEXT: v_readlane_b32 s39, v4, 7
-; VI-NEXT: v_readlane_b32 s38, v4, 6
-; VI-NEXT: v_readlane_b32 s37, v4, 5
-; VI-NEXT: v_readlane_b32 s36, v4, 4
-; VI-NEXT: v_readlane_b32 s35, v4, 3
-; VI-NEXT: v_readlane_b32 s34, v4, 2
-; VI-NEXT: v_readlane_b32 s31, v4, 1
-; VI-NEXT: v_readlane_b32 s30, v4, 0
+; VI-NEXT: v_readlane_b32 s67, v18, 19
+; VI-NEXT: v_readlane_b32 s66, v18, 18
+; VI-NEXT: v_readlane_b32 s65, v18, 17
+; VI-NEXT: v_readlane_b32 s64, v18, 16
+; VI-NEXT: v_readlane_b32 s55, v18, 15
+; VI-NEXT: v_readlane_b32 s54, v18, 14
+; VI-NEXT: v_readlane_b32 s53, v18, 13
+; VI-NEXT: v_readlane_b32 s52, v18, 12
+; VI-NEXT: v_readlane_b32 s51, v18, 11
+; VI-NEXT: v_readlane_b32 s50, v18, 10
+; VI-NEXT: v_readlane_b32 s49, v18, 9
+; VI-NEXT: v_readlane_b32 s48, v18, 8
+; VI-NEXT: v_readlane_b32 s39, v18, 7
+; VI-NEXT: v_readlane_b32 s38, v18, 6
+; VI-NEXT: v_readlane_b32 s37, v18, 5
+; VI-NEXT: v_readlane_b32 s36, v18, 4
+; VI-NEXT: v_readlane_b32 s35, v18, 3
+; VI-NEXT: v_readlane_b32 s34, v18, 2
+; VI-NEXT: v_readlane_b32 s31, v18, 1
+; VI-NEXT: v_readlane_b32 s30, v18, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -66826,31 +67557,31 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; VI-NEXT: ; implicit-def: $sgpr37
; VI-NEXT: ; implicit-def: $sgpr36
; VI-NEXT: ; implicit-def: $sgpr35
-; VI-NEXT: ; implicit-def: $sgpr14
+; VI-NEXT: ; implicit-def: $sgpr28
; VI-NEXT: ; implicit-def: $sgpr34
; VI-NEXT: ; implicit-def: $sgpr31
; VI-NEXT: ; implicit-def: $sgpr30
; VI-NEXT: ; implicit-def: $sgpr91
; VI-NEXT: ; implicit-def: $sgpr90
-; VI-NEXT: ; implicit-def: $sgpr12
+; VI-NEXT: ; implicit-def: $sgpr26
; VI-NEXT: ; implicit-def: $sgpr89
; VI-NEXT: ; implicit-def: $sgpr88
; VI-NEXT: ; implicit-def: $sgpr79
; VI-NEXT: ; implicit-def: $sgpr78
; VI-NEXT: ; implicit-def: $sgpr77
-; VI-NEXT: ; implicit-def: $sgpr10
+; VI-NEXT: ; implicit-def: $sgpr24
; VI-NEXT: ; implicit-def: $sgpr76
; VI-NEXT: ; implicit-def: $sgpr75
; VI-NEXT: ; implicit-def: $sgpr74
; VI-NEXT: ; implicit-def: $sgpr73
; VI-NEXT: ; implicit-def: $sgpr72
-; VI-NEXT: ; implicit-def: $sgpr8
+; VI-NEXT: ; implicit-def: $sgpr22
; VI-NEXT: ; implicit-def: $sgpr63
; VI-NEXT: ; implicit-def: $sgpr62
; VI-NEXT: ; implicit-def: $sgpr61
; VI-NEXT: ; implicit-def: $sgpr60
; VI-NEXT: ; implicit-def: $sgpr59
-; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr20
; VI-NEXT: ; implicit-def: $sgpr58
; VI-NEXT: ; implicit-def: $sgpr57
; VI-NEXT: ; implicit-def: $sgpr56
@@ -66877,10 +67608,38 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX9-NEXT: v_writelane_b32 v63, s52, 12
; GFX9-NEXT: v_writelane_b32 v63, s53, 13
; GFX9-NEXT: v_writelane_b32 v63, s54, 14
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: v_mov_b32_e32 v5, s17
+; GFX9-NEXT: v_mov_b32_e32 v6, s18
+; GFX9-NEXT: v_mov_b32_e32 v7, s19
+; GFX9-NEXT: v_mov_b32_e32 v8, s20
+; GFX9-NEXT: v_mov_b32_e32 v9, s21
+; GFX9-NEXT: v_mov_b32_e32 v10, s22
+; GFX9-NEXT: v_mov_b32_e32 v11, s23
+; GFX9-NEXT: v_mov_b32_e32 v12, s24
+; GFX9-NEXT: v_mov_b32_e32 v13, s25
+; GFX9-NEXT: v_mov_b32_e32 v14, s26
+; GFX9-NEXT: v_mov_b32_e32 v15, s27
+; GFX9-NEXT: v_mov_b32_e32 v16, s28
+; GFX9-NEXT: v_mov_b32_e32 v17, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; GFX9-NEXT: v_writelane_b32 v63, s55, 15
+; GFX9-NEXT: v_readfirstlane_b32 s18, v4
+; GFX9-NEXT: v_readfirstlane_b32 s19, v5
+; GFX9-NEXT: v_readfirstlane_b32 s16, v6
+; GFX9-NEXT: v_readfirstlane_b32 s17, v7
+; GFX9-NEXT: v_readfirstlane_b32 s14, v8
+; GFX9-NEXT: v_readfirstlane_b32 s15, v9
+; GFX9-NEXT: v_readfirstlane_b32 s12, v10
+; GFX9-NEXT: v_readfirstlane_b32 s13, v11
+; GFX9-NEXT: v_readfirstlane_b32 s10, v12
+; GFX9-NEXT: v_readfirstlane_b32 s11, v13
+; GFX9-NEXT: v_readfirstlane_b32 s8, v14
+; GFX9-NEXT: v_readfirstlane_b32 s9, v15
+; GFX9-NEXT: v_readfirstlane_b32 s6, v16
+; GFX9-NEXT: v_readfirstlane_b32 s7, v17
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -66904,76 +67663,76 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX9-NEXT: s_lshr_b32 s59, s5, 8
; GFX9-NEXT: s_lshr_b32 s58, s4, 16
; GFX9-NEXT: s_lshr_b32 s60, s4, 8
-; GFX9-NEXT: s_lshr_b32 s61, s29, 24
-; GFX9-NEXT: s_lshr_b32 s62, s29, 16
-; GFX9-NEXT: s_lshr_b32 s72, s29, 8
-; GFX9-NEXT: s_lshr_b32 s63, s28, 16
-; GFX9-NEXT: s_lshr_b32 s73, s28, 8
-; GFX9-NEXT: s_lshr_b32 s74, s27, 24
-; GFX9-NEXT: s_lshr_b32 s75, s27, 16
-; GFX9-NEXT: s_lshr_b32 s77, s27, 8
-; GFX9-NEXT: s_lshr_b32 s76, s26, 16
-; GFX9-NEXT: s_lshr_b32 s78, s26, 8
-; GFX9-NEXT: s_lshr_b32 s79, s25, 24
-; GFX9-NEXT: s_lshr_b32 s88, s25, 16
-; GFX9-NEXT: s_lshr_b32 s90, s25, 8
-; GFX9-NEXT: s_lshr_b32 s89, s24, 16
-; GFX9-NEXT: s_lshr_b32 s91, s24, 8
-; GFX9-NEXT: s_lshr_b32 s92, s23, 24
-; GFX9-NEXT: s_lshr_b32 s93, s23, 16
-; GFX9-NEXT: s_lshr_b32 s95, s23, 8
-; GFX9-NEXT: s_lshr_b32 s94, s22, 16
-; GFX9-NEXT: s_lshr_b32 s30, s22, 8
-; GFX9-NEXT: s_lshr_b32 s31, s21, 24
-; GFX9-NEXT: s_lshr_b32 s34, s21, 16
-; GFX9-NEXT: s_lshr_b32 s36, s21, 8
-; GFX9-NEXT: s_lshr_b32 s35, s20, 16
-; GFX9-NEXT: s_lshr_b32 s37, s20, 8
-; GFX9-NEXT: s_lshr_b32 s38, s19, 24
-; GFX9-NEXT: s_lshr_b32 s39, s19, 16
-; GFX9-NEXT: s_lshr_b32 s49, s19, 8
-; GFX9-NEXT: s_lshr_b32 s48, s18, 16
-; GFX9-NEXT: s_lshr_b32 s50, s18, 8
-; GFX9-NEXT: s_lshr_b32 s51, s17, 24
-; GFX9-NEXT: s_lshr_b32 s52, s17, 16
-; GFX9-NEXT: s_lshr_b32 s54, s17, 8
-; GFX9-NEXT: s_lshr_b32 s53, s16, 16
-; GFX9-NEXT: s_lshr_b32 s55, s16, 8
+; GFX9-NEXT: s_lshr_b32 s61, s7, 24
+; GFX9-NEXT: s_lshr_b32 s62, s7, 16
+; GFX9-NEXT: s_lshr_b32 s72, s7, 8
+; GFX9-NEXT: s_lshr_b32 s63, s6, 16
+; GFX9-NEXT: s_lshr_b32 s73, s6, 8
+; GFX9-NEXT: s_lshr_b32 s74, s9, 24
+; GFX9-NEXT: s_lshr_b32 s75, s9, 16
+; GFX9-NEXT: s_lshr_b32 s77, s9, 8
+; GFX9-NEXT: s_lshr_b32 s76, s8, 16
+; GFX9-NEXT: s_lshr_b32 s78, s8, 8
+; GFX9-NEXT: s_lshr_b32 s79, s11, 24
+; GFX9-NEXT: s_lshr_b32 s88, s11, 16
+; GFX9-NEXT: s_lshr_b32 s90, s11, 8
+; GFX9-NEXT: s_lshr_b32 s89, s10, 16
+; GFX9-NEXT: s_lshr_b32 s91, s10, 8
+; GFX9-NEXT: s_lshr_b32 s92, s13, 24
+; GFX9-NEXT: s_lshr_b32 s93, s13, 16
+; GFX9-NEXT: s_lshr_b32 s95, s13, 8
+; GFX9-NEXT: s_lshr_b32 s94, s12, 16
+; GFX9-NEXT: s_lshr_b32 s30, s12, 8
+; GFX9-NEXT: s_lshr_b32 s31, s15, 24
+; GFX9-NEXT: s_lshr_b32 s34, s15, 16
+; GFX9-NEXT: s_lshr_b32 s36, s15, 8
+; GFX9-NEXT: s_lshr_b32 s35, s14, 16
+; GFX9-NEXT: s_lshr_b32 s37, s14, 8
+; GFX9-NEXT: s_lshr_b32 s38, s17, 24
+; GFX9-NEXT: s_lshr_b32 s39, s17, 16
+; GFX9-NEXT: s_lshr_b32 s49, s17, 8
+; GFX9-NEXT: s_lshr_b32 s48, s16, 16
+; GFX9-NEXT: s_lshr_b32 s50, s16, 8
+; GFX9-NEXT: s_lshr_b32 s51, s19, 24
+; GFX9-NEXT: s_lshr_b32 s52, s19, 16
+; GFX9-NEXT: s_lshr_b32 s54, s19, 8
+; GFX9-NEXT: s_lshr_b32 s53, s18, 16
+; GFX9-NEXT: s_lshr_b32 s55, s18, 8
; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24
-; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24
-; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24
-; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24
-; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[42:43], s[6:7], 24
+; GFX9-NEXT: s_lshr_b64 s[40:41], s[8:9], 24
+; GFX9-NEXT: s_lshr_b64 s[28:29], s[10:11], 24
+; GFX9-NEXT: s_lshr_b64 s[26:27], s[12:13], 24
+; GFX9-NEXT: s_lshr_b64 s[24:25], s[14:15], 24
+; GFX9-NEXT: s_lshr_b64 s[22:23], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[20:21], s[18:19], 24
; GFX9-NEXT: s_cbranch_execnz .LBB97_4
; GFX9-NEXT: .LBB97_2: ; %cmp.true
-; GFX9-NEXT: v_pk_add_u16 v6, s27, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v5, s26, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v2, s5, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v1, s4, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v8, s25, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v7, s24, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2]
; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6]
-; GFX9-NEXT: v_pk_add_u16 v10, s23, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v9, s22, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8]
-; GFX9-NEXT: v_pk_add_u16 v12, s21, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v11, s20, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v4, s29, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v3, s28, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v3, s6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10]
-; GFX9-NEXT: v_pk_add_u16 v16, s19, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4]
; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12]
-; GFX9-NEXT: v_pk_add_u16 v20, s17, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v19, s16, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v20, s19, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v19, s18, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16]
; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
@@ -67023,31 +67782,31 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX9-NEXT: .LBB97_3:
; GFX9-NEXT: ; implicit-def: $sgpr55
; GFX9-NEXT: ; implicit-def: $sgpr53
-; GFX9-NEXT: ; implicit-def: $sgpr6
+; GFX9-NEXT: ; implicit-def: $sgpr20
; GFX9-NEXT: ; implicit-def: $sgpr54
; GFX9-NEXT: ; implicit-def: $sgpr52
; GFX9-NEXT: ; implicit-def: $sgpr51
; GFX9-NEXT: ; implicit-def: $sgpr50
; GFX9-NEXT: ; implicit-def: $sgpr48
-; GFX9-NEXT: ; implicit-def: $sgpr8
+; GFX9-NEXT: ; implicit-def: $sgpr22
; GFX9-NEXT: ; implicit-def: $sgpr49
; GFX9-NEXT: ; implicit-def: $sgpr39
; GFX9-NEXT: ; implicit-def: $sgpr38
; GFX9-NEXT: ; implicit-def: $sgpr37
; GFX9-NEXT: ; implicit-def: $sgpr35
-; GFX9-NEXT: ; implicit-def: $sgpr10
+; GFX9-NEXT: ; implicit-def: $sgpr24
; GFX9-NEXT: ; implicit-def: $sgpr36
; GFX9-NEXT: ; implicit-def: $sgpr34
; GFX9-NEXT: ; implicit-def: $sgpr31
; GFX9-NEXT: ; implicit-def: $sgpr30
; GFX9-NEXT: ; implicit-def: $sgpr94
-; GFX9-NEXT: ; implicit-def: $sgpr12
+; GFX9-NEXT: ; implicit-def: $sgpr26
; GFX9-NEXT: ; implicit-def: $sgpr95
; GFX9-NEXT: ; implicit-def: $sgpr93
; GFX9-NEXT: ; implicit-def: $sgpr92
; GFX9-NEXT: ; implicit-def: $sgpr91
; GFX9-NEXT: ; implicit-def: $sgpr89
-; GFX9-NEXT: ; implicit-def: $sgpr14
+; GFX9-NEXT: ; implicit-def: $sgpr28
; GFX9-NEXT: ; implicit-def: $sgpr90
; GFX9-NEXT: ; implicit-def: $sgpr88
; GFX9-NEXT: ; implicit-def: $sgpr79
@@ -67076,20 +67835,20 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v21, s42
-; GFX9-NEXT: v_mov_b32_e32 v19, s16
-; GFX9-NEXT: v_mov_b32_e32 v20, s17
-; GFX9-NEXT: v_mov_b32_e32 v15, s18
-; GFX9-NEXT: v_mov_b32_e32 v16, s19
-; GFX9-NEXT: v_mov_b32_e32 v11, s20
-; GFX9-NEXT: v_mov_b32_e32 v12, s21
-; GFX9-NEXT: v_mov_b32_e32 v9, s22
-; GFX9-NEXT: v_mov_b32_e32 v10, s23
-; GFX9-NEXT: v_mov_b32_e32 v7, s24
-; GFX9-NEXT: v_mov_b32_e32 v8, s25
-; GFX9-NEXT: v_mov_b32_e32 v5, s26
-; GFX9-NEXT: v_mov_b32_e32 v6, s27
-; GFX9-NEXT: v_mov_b32_e32 v3, s28
-; GFX9-NEXT: v_mov_b32_e32 v4, s29
+; GFX9-NEXT: v_mov_b32_e32 v19, s18
+; GFX9-NEXT: v_mov_b32_e32 v20, s19
+; GFX9-NEXT: v_mov_b32_e32 v15, s16
+; GFX9-NEXT: v_mov_b32_e32 v16, s17
+; GFX9-NEXT: v_mov_b32_e32 v11, s14
+; GFX9-NEXT: v_mov_b32_e32 v12, s15
+; GFX9-NEXT: v_mov_b32_e32 v9, s12
+; GFX9-NEXT: v_mov_b32_e32 v10, s13
+; GFX9-NEXT: v_mov_b32_e32 v7, s10
+; GFX9-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NEXT: v_mov_b32_e32 v5, s8
+; GFX9-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_mov_b32_e32 v17, s55
@@ -67132,15 +67891,15 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX9-NEXT: v_mov_b32_e32 v27, s59
; GFX9-NEXT: v_mov_b32_e32 v14, s57
; GFX9-NEXT: v_mov_b32_e32 v18, s56
-; GFX9-NEXT: v_mov_b32_e32 v23, s12
-; GFX9-NEXT: v_mov_b32_e32 v24, s10
-; GFX9-NEXT: v_mov_b32_e32 v25, s8
-; GFX9-NEXT: v_mov_b32_e32 v26, s6
+; GFX9-NEXT: v_mov_b32_e32 v23, s26
+; GFX9-NEXT: v_mov_b32_e32 v24, s24
+; GFX9-NEXT: v_mov_b32_e32 v25, s22
+; GFX9-NEXT: v_mov_b32_e32 v26, s20
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v21, s40
-; GFX9-NEXT: v_mov_b32_e32 v22, s14
+; GFX9-NEXT: v_mov_b32_e32 v22, s28
; GFX9-NEXT: .LBB97_5: ; %end
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -67218,21 +67977,6 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s55, v63, 15
; GFX9-NEXT: v_readlane_b32 s54, v63, 14
; GFX9-NEXT: v_readlane_b32 s53, v63, 13
@@ -67249,7 +67993,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX9-NEXT: v_readlane_b32 s34, v63, 2
; GFX9-NEXT: v_readlane_b32 s31, v63, 1
; GFX9-NEXT: v_readlane_b32 s30, v63, 0
-; GFX9-NEXT: s_waitcnt vmcnt(16)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -67275,6 +68019,21 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -74852,663 +75611,340 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; VI-LABEL: bitcast_v32bf16_to_v32f16_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v20, s30, 0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: v_writelane_b32 v20, s31, 1
-; VI-NEXT: v_readfirstlane_b32 s30, v0
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; VI-NEXT: v_mov_b32_e32 v15, v1
+; VI-NEXT: v_mov_b32_e32 v14, v0
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v22, s17
+; VI-NEXT: v_mov_b32_e32 v2, s18
+; VI-NEXT: v_mov_b32_e32 v21, s19
+; VI-NEXT: v_mov_b32_e32 v4, s20
+; VI-NEXT: v_mov_b32_e32 v20, s21
+; VI-NEXT: v_mov_b32_e32 v6, s22
+; VI-NEXT: v_mov_b32_e32 v19, s23
+; VI-NEXT: v_mov_b32_e32 v8, s24
+; VI-NEXT: v_mov_b32_e32 v18, s25
+; VI-NEXT: v_mov_b32_e32 v17, s27
+; VI-NEXT: v_mov_b32_e32 v16, s29
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_readfirstlane_b32 s31, v1
-; VI-NEXT: s_cbranch_scc0 .LBB103_3
+; VI-NEXT: v_mov_b32_e32 v10, s26
+; VI-NEXT: v_mov_b32_e32 v12, s28
+; VI-NEXT: s_cbranch_scc0 .LBB103_4
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB103_4
+; VI-NEXT: s_cbranch_execnz .LBB103_3
; VI-NEXT: .LBB103_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v0, s4, v1
-; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: s_lshl_b32 s5, s30, 16
-; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s5, v1
+; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v1
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_bfe_u32 v5, v2, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v3
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s5, s30, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s5, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_lshl_b32 s5, s31, 16
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s5, v1
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v21
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v21
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_bfe_u32 v7, v4, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v24, v9, v11, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v4
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v5
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: s_and_b32 s5, s31, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_add_f32_e32 v5, s5, v1
-; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
-; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v16, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v16, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc
-; VI-NEXT: v_add_f32_e32 v16, s4, v1
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16
-; VI-NEXT: v_add_f32_e32 v16, s4, v1
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_add_f32_e32 v17, s4, v1
-; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v20
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_bfe_u32 v9, v6, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v7
+; VI-NEXT: v_bfe_u32 v7, v6, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_cndmask_b32_e32 v6, v7, v11, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v19
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v19
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_bfe_u32 v11, v8, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v26, v13, v19, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v8
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_cndmask_b32_e32 v8, v9, v13, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v11
+; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v18
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_cndmask_b32_e32 v11, v13, v19, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v18
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_bfe_u32 v18, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v13
; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; VI-NEXT: v_bfe_u32 v13, v10, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v11, v10, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v13, v13, v28, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; VI-NEXT: v_cndmask_b32_e32 v10, v11, v28, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v13
+; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_bfe_u32 v28, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v13
+; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_or_b32_e32 v29, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_cndmask_b32_e32 v13, v28, v29, vcc
+; VI-NEXT: v_bfe_u32 v28, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v17
+; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28
+; VI-NEXT: v_or_b32_e32 v29, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT: v_add_f32_e32 v1, s4, v1
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; VI-NEXT: v_bfe_u32 v18, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16
-; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16
-; VI-NEXT: s_branch .LBB103_5
-; VI-NEXT: .LBB103_3:
-; VI-NEXT: s_branch .LBB103_2
-; VI-NEXT: .LBB103_4:
-; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v1, s17
-; VI-NEXT: v_mov_b32_e32 v2, s18
-; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: v_mov_b32_e32 v4, s20
-; VI-NEXT: v_mov_b32_e32 v5, s21
-; VI-NEXT: v_mov_b32_e32 v6, s22
-; VI-NEXT: v_mov_b32_e32 v7, s23
-; VI-NEXT: v_mov_b32_e32 v8, s24
-; VI-NEXT: v_mov_b32_e32 v9, s25
-; VI-NEXT: v_mov_b32_e32 v10, s26
-; VI-NEXT: v_mov_b32_e32 v11, s27
-; VI-NEXT: v_mov_b32_e32 v12, s28
-; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: v_mov_b32_e32 v14, s30
-; VI-NEXT: v_mov_b32_e32 v15, s31
-; VI-NEXT: .LBB103_5: ; %end
-; VI-NEXT: v_readlane_b32 s31, v20, 1
-; VI-NEXT: v_readlane_b32 s30, v20, 0
-; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cndmask_b32_e32 v28, v28, v29, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; VI-NEXT: v_bfe_u32 v17, v12, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v12
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v13
+; VI-NEXT: v_bfe_u32 v13, v12, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v17, v17, v30, vcc
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; VI-NEXT: v_cndmask_b32_e32 v12, v13, v30, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v30, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v17
+; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; VI-NEXT: v_or_b32_e32 v31, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_cndmask_b32_e32 v17, v30, v31, vcc
+; VI-NEXT: v_bfe_u32 v30, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v16
+; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30
+; VI-NEXT: v_or_b32_e32 v31, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_cndmask_b32_e32 v16, v30, v31, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v14
+; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; VI-NEXT: v_bfe_u32 v31, v14, 16, 1
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14
+; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; VI-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc
+; VI-NEXT: v_bfe_u32 v31, v30, 16, 1
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30
+; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v30
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; VI-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v15
+; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; VI-NEXT: v_bfe_u32 v32, v14, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v14
+; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23]
+; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[24:25]
+; VI-NEXT: v_bfe_u32 v15, v14, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21]
+; VI-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14
+; VI-NEXT: v_mov_b32_e32 v21, v23
+; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[26:27]
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_mov_b32_e32 v19, v23
+; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[28:29]
+; VI-NEXT: v_cndmask_b32_e32 v14, v15, v33, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v32
+; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17]
+; VI-NEXT: v_mov_b32_e32 v17, v23
+; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[14:15]
+; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[30:31]
+; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[12:13]
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11]
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9]
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7]
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5]
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v15, v23
+; VI-NEXT: .LBB103_3: ; %end
+; VI-NEXT: v_mov_b32_e32 v1, v22
+; VI-NEXT: v_mov_b32_e32 v3, v21
+; VI-NEXT: v_mov_b32_e32 v5, v20
+; VI-NEXT: v_mov_b32_e32 v7, v19
+; VI-NEXT: v_mov_b32_e32 v9, v18
+; VI-NEXT: v_mov_b32_e32 v11, v17
+; VI-NEXT: v_mov_b32_e32 v13, v16
; VI-NEXT: s_setpc_b64 s[30:31]
+; VI-NEXT: .LBB103_4:
+; VI-NEXT: s_branch .LBB103_2
;
; GFX9-LABEL: bitcast_v32bf16_to_v32f16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v20, s30, 0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT: v_writelane_b32 v20, s31, 1
-; GFX9-NEXT: v_readfirstlane_b32 s30, v0
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT: v_readfirstlane_b32 s31, v1
-; GFX9-NEXT: s_cbranch_scc0 .LBB103_3
-; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB103_4
-; GFX9-NEXT: .LBB103_2: ; %cmp.true
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; GFX9-NEXT: s_and_b32 s5, s30, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v1, s5, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s5, s30, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s5, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b32 s5, s31, 0xffff0000
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add_f32_e32 v3, s5, v0
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
-; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: s_lshl_b32 s5, s31, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_add_f32_e32 v4, s5, v0
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff
-; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_add_u32_e32 v5, v5, v4
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_lshl_or_b32 v15, v3, 16, v4
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s29, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s28, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s27, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s26, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s25, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s24, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s23, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s22, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s21, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s20, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s19, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc
-; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s18, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0
-; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v17, v17, v2
-; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc
-; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v0
-; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v17, v17, v1
-; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_lshl_b32 s4, s17, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc
-; GFX9-NEXT: v_add_f32_e32 v17, s4, v0
-; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
-; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
-; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17
-; GFX9-NEXT: v_add_f32_e32 v17, s4, v0
-; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
-; GFX9-NEXT: s_lshl_b32 s4, s16, 16
-; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
-; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_add_f32_e32 v0, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v18, v18, v0
-; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
-; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16
-; GFX9-NEXT: s_branch .LBB103_5
-; GFX9-NEXT: .LBB103_3:
-; GFX9-NEXT: s_branch .LBB103_2
-; GFX9-NEXT: .LBB103_4:
+; GFX9-NEXT: v_mov_b32_e32 v13, v2
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; GFX9-NEXT: v_mov_b32_e32 v15, v1
+; GFX9-NEXT: v_mov_b32_e32 v14, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
@@ -75522,17 +75958,321 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; GFX9-NEXT: v_mov_b32_e32 v10, s26
; GFX9-NEXT: v_mov_b32_e32 v11, s27
; GFX9-NEXT: v_mov_b32_e32 v12, s28
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: v_mov_b32_e32 v14, s30
-; GFX9-NEXT: v_mov_b32_e32 v15, s31
-; GFX9-NEXT: .LBB103_5: ; %end
-; GFX9-NEXT: v_readlane_b32 s31, v20, 1
-; GFX9-NEXT: v_readlane_b32 s30, v20, 0
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_cbranch_scc0 .LBB103_4
+; GFX9-NEXT: ; %bb.1: ; %cmp.false
+; GFX9-NEXT: s_cbranch_execnz .LBB103_3
+; GFX9-NEXT: .LBB103_2: ; %cmp.true
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v0
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v17, v17, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v0, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v17, v17, v0
+; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v18, v18, v1
+; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v2
+; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v19, v19, v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19
+; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc
+; GFX9-NEXT: v_bfe_u32 v19, v2, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v19, v19, v2
+; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19
+; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v20, vcc
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v3
+; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; GFX9-NEXT: v_bfe_u32 v20, v19, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v20, v20, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_add_u32_e32 v20, 0x7fff, v20
+; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc
+; GFX9-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v20, v20, v3
+; GFX9-NEXT: v_add_u32_e32 v20, 0x7fff, v20
+; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v3
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4
+; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; GFX9-NEXT: v_bfe_u32 v21, v20, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v21, v21, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_add_u32_e32 v21, 0x7fff, v21
+; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc
+; GFX9-NEXT: v_bfe_u32 v21, v4, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v21, v21, v4
+; GFX9-NEXT: v_add_u32_e32 v21, 0x7fff, v21
+; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v4
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc
+; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v5
+; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; GFX9-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v22, v22, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_add_u32_e32 v22, 0x7fff, v22
+; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc
+; GFX9-NEXT: v_bfe_u32 v22, v5, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v22, v22, v5
+; GFX9-NEXT: v_add_u32_e32 v22, 0x7fff, v22
+; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v5
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc
+; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v6
+; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; GFX9-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v23, v23, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_add_u32_e32 v23, 0x7fff, v23
+; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc
+; GFX9-NEXT: v_bfe_u32 v23, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v23, v23, v6
+; GFX9-NEXT: v_add_u32_e32 v23, 0x7fff, v23
+; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v24, vcc
+; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v7
+; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; GFX9-NEXT: v_bfe_u32 v24, v23, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v24, v24, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_add_u32_e32 v24, 0x7fff, v24
+; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc
+; GFX9-NEXT: v_bfe_u32 v24, v7, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v24, v24, v7
+; GFX9-NEXT: v_add_u32_e32 v24, 0x7fff, v24
+; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc
+; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v8
+; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; GFX9-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v25, v25, v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_add_u32_e32 v25, 0x7fff, v25
+; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX9-NEXT: v_bfe_u32 v25, v8, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v25, v25, v8
+; GFX9-NEXT: v_add_u32_e32 v25, 0x7fff, v25
+; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v26, vcc
+; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v9
+; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v26, v26, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_add_u32_e32 v26, 0x7fff, v26
+; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc
+; GFX9-NEXT: v_bfe_u32 v26, v9, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v26, v26, v9
+; GFX9-NEXT: v_add_u32_e32 v26, 0x7fff, v26
+; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v9
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v27, vcc
+; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v10
+; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; GFX9-NEXT: v_bfe_u32 v27, v26, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v27, v27, v26
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_add_u32_e32 v27, 0x7fff, v27
+; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc
+; GFX9-NEXT: v_bfe_u32 v27, v10, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v27, v27, v10
+; GFX9-NEXT: v_add_u32_e32 v27, 0x7fff, v27
+; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v10
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc
+; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v11
+; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; GFX9-NEXT: v_bfe_u32 v28, v27, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v28, v28, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_add_u32_e32 v28, 0x7fff, v28
+; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc
+; GFX9-NEXT: v_bfe_u32 v28, v11, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v28, v28, v11
+; GFX9-NEXT: v_add_u32_e32 v28, 0x7fff, v28
+; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v11
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc
+; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v12
+; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; GFX9-NEXT: v_bfe_u32 v29, v28, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v29, v29, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_add_u32_e32 v29, 0x7fff, v29
+; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v28
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc
+; GFX9-NEXT: v_bfe_u32 v29, v12, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v29, v29, v12
+; GFX9-NEXT: v_add_u32_e32 v29, 0x7fff, v29
+; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v12
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v30, vcc
+; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v13
+; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; GFX9-NEXT: v_bfe_u32 v30, v29, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v30, v30, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_add_u32_e32 v30, 0x7fff, v30
+; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v29
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v29, v30, v31, vcc
+; GFX9-NEXT: v_bfe_u32 v30, v13, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v30, v30, v13
+; GFX9-NEXT: v_add_u32_e32 v30, 0x7fff, v30
+; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc
+; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v14
+; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; GFX9-NEXT: v_bfe_u32 v31, v30, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v31, v31, v30
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_add_u32_e32 v31, 0x7fff, v31
+; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v30
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc
+; GFX9-NEXT: v_bfe_u32 v31, v14, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v31, v31, v14
+; GFX9-NEXT: v_add_u32_e32 v31, 0x7fff, v31
+; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc
+; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
+; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v32, v32, v31
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_add_u32_e32 v32, 0x7fff, v32
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; GFX9-NEXT: v_bfe_u32 v32, v15, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v32, v32, v15
+; GFX9-NEXT: v_add_u32_e32 v32, 0x7fff, v32
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc
+; GFX9-NEXT: v_mov_b32_e32 v32, 0xffff
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX9-NEXT: v_and_b32_sdwa v15, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_and_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_and_b32_sdwa v13, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX9-NEXT: v_and_b32_sdwa v12, v32, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_and_b32_sdwa v11, v32, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_and_b32_sdwa v10, v32, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_and_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_and_b32_sdwa v8, v32, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_and_b32_sdwa v7, v32, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_and_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_and_b32_sdwa v5, v32, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_and_b32_sdwa v4, v32, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_and_b32_sdwa v3, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_and_b32_sdwa v2, v32, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v15, v31, 16, v15
+; GFX9-NEXT: v_lshl_or_b32 v14, v30, 16, v14
+; GFX9-NEXT: v_lshl_or_b32 v13, v29, 16, v13
+; GFX9-NEXT: v_lshl_or_b32 v12, v28, 16, v12
+; GFX9-NEXT: v_lshl_or_b32 v11, v27, 16, v11
+; GFX9-NEXT: v_lshl_or_b32 v10, v26, 16, v10
+; GFX9-NEXT: v_lshl_or_b32 v9, v25, 16, v9
+; GFX9-NEXT: v_lshl_or_b32 v8, v24, 16, v8
+; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7
+; GFX9-NEXT: v_lshl_or_b32 v6, v22, 16, v6
+; GFX9-NEXT: v_lshl_or_b32 v5, v21, 16, v5
+; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4
+; GFX9-NEXT: v_lshl_or_b32 v3, v19, 16, v3
+; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2
+; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1
+; GFX9-NEXT: v_lshl_or_b32 v0, v16, 16, v0
+; GFX9-NEXT: .LBB103_3: ; %end
; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: .LBB103_4:
+; GFX9-NEXT: s_branch .LBB103_2
;
; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v32f16_scalar:
; GFX11-TRUE16: ; %bb.0:
@@ -76957,24 +77697,8 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xff, v9
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
@@ -77002,6 +77726,22 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -77314,23 +78054,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29
; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -77354,6 +78078,22 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -77626,25 +78366,9 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -77671,6 +78395,22 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -78863,7 +79603,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v63, s30, 0
; VI-NEXT: v_writelane_b32 v63, s31, 1
@@ -78884,10 +79624,38 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: v_writelane_b32 v63, s64, 16
; VI-NEXT: v_writelane_b32 v63, s65, 17
; VI-NEXT: v_writelane_b32 v63, s66, 18
+; VI-NEXT: v_mov_b32_e32 v4, s16
+; VI-NEXT: v_mov_b32_e32 v5, s17
+; VI-NEXT: v_mov_b32_e32 v6, s18
+; VI-NEXT: v_mov_b32_e32 v7, s19
+; VI-NEXT: v_mov_b32_e32 v8, s20
+; VI-NEXT: v_mov_b32_e32 v9, s21
+; VI-NEXT: v_mov_b32_e32 v10, s22
+; VI-NEXT: v_mov_b32_e32 v11, s23
+; VI-NEXT: v_mov_b32_e32 v12, s24
+; VI-NEXT: v_mov_b32_e32 v13, s25
+; VI-NEXT: v_mov_b32_e32 v14, s26
+; VI-NEXT: v_mov_b32_e32 v15, s27
+; VI-NEXT: v_mov_b32_e32 v16, s28
+; VI-NEXT: v_mov_b32_e32 v17, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; VI-NEXT: v_writelane_b32 v63, s67, 19
+; VI-NEXT: v_readfirstlane_b32 s18, v4
+; VI-NEXT: v_readfirstlane_b32 s19, v5
+; VI-NEXT: v_readfirstlane_b32 s16, v6
+; VI-NEXT: v_readfirstlane_b32 s17, v7
+; VI-NEXT: v_readfirstlane_b32 s14, v8
+; VI-NEXT: v_readfirstlane_b32 s15, v9
+; VI-NEXT: v_readfirstlane_b32 s12, v10
+; VI-NEXT: v_readfirstlane_b32 s13, v11
+; VI-NEXT: v_readfirstlane_b32 s10, v12
+; VI-NEXT: v_readfirstlane_b32 s11, v13
+; VI-NEXT: v_readfirstlane_b32 s8, v14
+; VI-NEXT: v_readfirstlane_b32 s9, v15
+; VI-NEXT: v_readfirstlane_b32 s6, v16
+; VI-NEXT: v_readfirstlane_b32 s7, v17
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_and_b64 s[20:21], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s5, v2
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -78906,232 +79674,230 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB105_3
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_lshr_b32 s75, s5, 24
+; VI-NEXT: s_lshr_b32 s72, s5, 24
; VI-NEXT: s_lshr_b32 s36, s5, 16
-; VI-NEXT: s_lshr_b32 s58, s5, 8
+; VI-NEXT: s_lshr_b32 s56, s5, 8
; VI-NEXT: s_lshr_b32 s37, s4, 16
-; VI-NEXT: s_lshr_b32 s56, s4, 8
-; VI-NEXT: s_lshr_b32 s77, s29, 24
-; VI-NEXT: s_lshr_b32 s38, s29, 16
-; VI-NEXT: s_lshr_b32 s62, s29, 8
-; VI-NEXT: s_lshr_b32 s39, s28, 16
-; VI-NEXT: s_lshr_b32 s57, s28, 8
-; VI-NEXT: s_lshr_b32 s79, s27, 24
-; VI-NEXT: s_lshr_b32 s48, s27, 16
-; VI-NEXT: s_lshr_b32 s74, s27, 8
-; VI-NEXT: s_lshr_b32 s49, s26, 16
-; VI-NEXT: s_lshr_b32 s59, s26, 8
-; VI-NEXT: s_lshr_b32 s89, s25, 24
-; VI-NEXT: s_lshr_b32 s50, s25, 16
-; VI-NEXT: s_lshr_b32 s76, s25, 8
-; VI-NEXT: s_lshr_b32 s51, s24, 16
-; VI-NEXT: s_lshr_b32 s60, s24, 8
-; VI-NEXT: s_lshr_b32 s91, s23, 24
-; VI-NEXT: s_lshr_b32 s52, s23, 16
-; VI-NEXT: s_lshr_b32 s78, s23, 8
-; VI-NEXT: s_lshr_b32 s53, s22, 16
-; VI-NEXT: s_lshr_b32 s61, s22, 8
-; VI-NEXT: s_lshr_b32 s31, s21, 24
-; VI-NEXT: s_lshr_b32 s54, s21, 16
-; VI-NEXT: s_lshr_b32 s88, s21, 8
-; VI-NEXT: s_lshr_b32 s55, s20, 16
-; VI-NEXT: s_lshr_b32 s63, s20, 8
-; VI-NEXT: s_lshr_b32 s34, s19, 24
-; VI-NEXT: s_lshr_b32 s64, s19, 16
-; VI-NEXT: s_lshr_b32 s90, s19, 8
-; VI-NEXT: s_lshr_b32 s65, s18, 16
-; VI-NEXT: s_lshr_b32 s72, s18, 8
-; VI-NEXT: s_lshr_b32 s35, s17, 24
-; VI-NEXT: s_lshr_b32 s66, s17, 16
-; VI-NEXT: s_lshr_b32 s30, s17, 8
-; VI-NEXT: s_lshr_b32 s67, s16, 16
-; VI-NEXT: s_lshr_b32 s73, s16, 8
+; VI-NEXT: s_lshr_b32 s57, s4, 8
+; VI-NEXT: s_lshr_b32 s75, s7, 24
+; VI-NEXT: s_lshr_b32 s38, s7, 16
+; VI-NEXT: s_lshr_b32 s58, s7, 8
+; VI-NEXT: s_lshr_b32 s39, s6, 16
+; VI-NEXT: s_lshr_b32 s59, s6, 8
+; VI-NEXT: s_lshr_b32 s77, s9, 24
+; VI-NEXT: s_lshr_b32 s48, s9, 16
+; VI-NEXT: s_lshr_b32 s60, s9, 8
+; VI-NEXT: s_lshr_b32 s49, s8, 16
+; VI-NEXT: s_lshr_b32 s61, s8, 8
+; VI-NEXT: s_lshr_b32 s79, s11, 24
+; VI-NEXT: s_lshr_b32 s50, s11, 16
+; VI-NEXT: s_lshr_b32 s62, s11, 8
+; VI-NEXT: s_lshr_b32 s51, s10, 16
+; VI-NEXT: s_lshr_b32 s63, s10, 8
+; VI-NEXT: s_lshr_b32 s90, s13, 24
+; VI-NEXT: s_lshr_b32 s52, s13, 16
+; VI-NEXT: s_lshr_b32 s73, s13, 8
+; VI-NEXT: s_lshr_b32 s53, s12, 16
+; VI-NEXT: s_lshr_b32 s74, s12, 8
+; VI-NEXT: s_lshr_b32 s31, s15, 24
+; VI-NEXT: s_lshr_b32 s54, s15, 16
+; VI-NEXT: s_lshr_b32 s76, s15, 8
+; VI-NEXT: s_lshr_b32 s55, s14, 16
+; VI-NEXT: s_lshr_b32 s78, s14, 8
+; VI-NEXT: s_lshr_b32 s34, s17, 24
+; VI-NEXT: s_lshr_b32 s64, s17, 16
+; VI-NEXT: s_lshr_b32 s88, s17, 8
+; VI-NEXT: s_lshr_b32 s65, s16, 16
+; VI-NEXT: s_lshr_b32 s89, s16, 8
+; VI-NEXT: s_lshr_b32 s35, s19, 24
+; VI-NEXT: s_lshr_b32 s66, s19, 16
+; VI-NEXT: s_lshr_b32 s91, s19, 8
+; VI-NEXT: s_lshr_b32 s67, s18, 16
+; VI-NEXT: s_lshr_b32 s30, s18, 8
; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24
-; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24
-; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24
-; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24
-; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24
-; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24
-; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24
-; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[42:43], s[6:7], 24
+; VI-NEXT: s_lshr_b64 s[40:41], s[8:9], 24
+; VI-NEXT: s_lshr_b64 s[28:29], s[10:11], 24
+; VI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24
+; VI-NEXT: s_lshr_b64 s[24:25], s[14:15], 24
+; VI-NEXT: s_lshr_b64 s[22:23], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24
; VI-NEXT: s_cbranch_execnz .LBB105_4
; VI-NEXT: .LBB105_2: ; %cmp.true
-; VI-NEXT: s_lshr_b32 s6, s17, 16
+; VI-NEXT: s_lshr_b32 s20, s19, 16
; VI-NEXT: v_mov_b32_e32 v1, 0x200
-; VI-NEXT: v_add_f16_e32 v12, s6, v1
-; VI-NEXT: s_lshr_b32 s6, s16, 16
+; VI-NEXT: v_add_f16_e32 v12, s20, v1
+; VI-NEXT: v_add_f16_e32 v27, s19, v1
+; VI-NEXT: s_lshr_b32 s19, s18, 16
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
-; VI-NEXT: v_add_f16_e32 v27, s17, v1
-; VI-NEXT: v_add_f16_e32 v19, s6, v1
-; VI-NEXT: s_lshr_b32 s6, s19, 16
+; VI-NEXT: v_add_f16_e32 v19, s19, v1
+; VI-NEXT: v_add_f16_e32 v35, s18, v1
+; VI-NEXT: s_lshr_b32 s18, s17, 16
; VI-NEXT: v_or_b32_e32 v10, v27, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v19
-; VI-NEXT: v_add_f16_e32 v35, s16, v1
-; VI-NEXT: v_add_f16_e32 v13, s6, v1
-; VI-NEXT: s_lshr_b32 s6, s18, 16
+; VI-NEXT: v_add_f16_e32 v13, s18, v1
+; VI-NEXT: v_add_f16_e32 v28, s17, v1
+; VI-NEXT: s_lshr_b32 s17, s16, 16
; VI-NEXT: v_or_b32_e32 v9, v35, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v13
-; VI-NEXT: v_add_f16_e32 v28, s19, v1
-; VI-NEXT: v_add_f16_e32 v20, s6, v1
-; VI-NEXT: s_lshr_b32 s6, s21, 16
-; VI-NEXT: v_or_b32_e32 v62, v28, v2
+; VI-NEXT: v_add_f16_e32 v20, s17, v1
+; VI-NEXT: v_add_f16_e32 v36, s16, v1
+; VI-NEXT: s_lshr_b32 s16, s15, 16
+; VI-NEXT: v_or_b32_e32 v52, v28, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v20
-; VI-NEXT: v_add_f16_e32 v36, s18, v1
-; VI-NEXT: v_add_f16_e32 v14, s6, v1
-; VI-NEXT: s_lshr_b32 s6, s20, 16
-; VI-NEXT: v_or_b32_e32 v61, v36, v2
+; VI-NEXT: v_add_f16_e32 v14, s16, v1
+; VI-NEXT: v_add_f16_e32 v29, s15, v1
+; VI-NEXT: s_lshr_b32 s15, s14, 16
+; VI-NEXT: v_or_b32_e32 v51, v36, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v14
-; VI-NEXT: v_add_f16_e32 v29, s21, v1
-; VI-NEXT: v_add_f16_e32 v21, s6, v1
-; VI-NEXT: s_lshr_b32 s6, s23, 16
+; VI-NEXT: v_add_f16_e32 v21, s15, v1
+; VI-NEXT: v_add_f16_e32 v37, s14, v1
+; VI-NEXT: s_lshr_b32 s14, s13, 16
; VI-NEXT: v_or_b32_e32 v8, v29, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v21
-; VI-NEXT: v_add_f16_e32 v37, s20, v1
-; VI-NEXT: v_add_f16_e32 v15, s6, v1
-; VI-NEXT: s_lshr_b32 s6, s22, 16
+; VI-NEXT: v_add_f16_e32 v15, s14, v1
+; VI-NEXT: v_add_f16_e32 v30, s13, v1
+; VI-NEXT: s_lshr_b32 s13, s12, 16
; VI-NEXT: v_or_b32_e32 v7, v37, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v15
-; VI-NEXT: v_add_f16_e32 v30, s23, v1
-; VI-NEXT: v_add_f16_e32 v22, s6, v1
-; VI-NEXT: s_lshr_b32 s6, s25, 16
-; VI-NEXT: v_or_b32_e32 v47, v30, v2
+; VI-NEXT: v_add_f16_e32 v22, s13, v1
+; VI-NEXT: v_add_f16_e32 v38, s12, v1
+; VI-NEXT: s_lshr_b32 s12, s11, 16
+; VI-NEXT: v_or_b32_e32 v57, v30, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v22
-; VI-NEXT: v_add_f16_e32 v38, s22, v1
-; VI-NEXT: v_add_f16_e32 v16, s6, v1
-; VI-NEXT: s_lshr_b32 s6, s24, 16
-; VI-NEXT: v_or_b32_e32 v46, v38, v2
+; VI-NEXT: v_add_f16_e32 v16, s12, v1
+; VI-NEXT: v_add_f16_e32 v31, s11, v1
+; VI-NEXT: s_lshr_b32 s11, s10, 16
+; VI-NEXT: v_or_b32_e32 v56, v38, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v16
-; VI-NEXT: v_add_f16_e32 v31, s25, v1
-; VI-NEXT: v_add_f16_e32 v23, s6, v1
-; VI-NEXT: s_lshr_b32 s6, s27, 16
+; VI-NEXT: v_add_f16_e32 v23, s11, v1
+; VI-NEXT: v_add_f16_e32 v39, s10, v1
+; VI-NEXT: s_lshr_b32 s10, s9, 16
; VI-NEXT: v_or_b32_e32 v6, v31, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v23
-; VI-NEXT: v_add_f16_e32 v39, s24, v1
-; VI-NEXT: v_add_f16_e32 v17, s6, v1
-; VI-NEXT: s_lshr_b32 s6, s26, 16
+; VI-NEXT: v_add_f16_e32 v17, s10, v1
+; VI-NEXT: v_add_f16_e32 v32, s9, v1
+; VI-NEXT: s_lshr_b32 s9, s8, 16
; VI-NEXT: v_or_b32_e32 v5, v39, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v17
-; VI-NEXT: v_add_f16_e32 v32, s27, v1
-; VI-NEXT: v_add_f16_e32 v24, s6, v1
-; VI-NEXT: s_lshr_b32 s6, s29, 16
+; VI-NEXT: v_add_f16_e32 v24, s9, v1
+; VI-NEXT: v_add_f16_e32 v48, s8, v1
+; VI-NEXT: s_lshr_b32 s8, s7, 16
; VI-NEXT: v_or_b32_e32 v43, v32, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v24
-; VI-NEXT: v_add_f16_e32 v48, s26, v1
-; VI-NEXT: v_add_f16_e32 v18, s6, v1
-; VI-NEXT: s_lshr_b32 s6, s28, 16
+; VI-NEXT: v_add_f16_e32 v18, s8, v1
+; VI-NEXT: v_add_f16_e32 v33, s7, v1
+; VI-NEXT: s_lshr_b32 s7, s6, 16
; VI-NEXT: v_or_b32_e32 v42, v48, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v18
-; VI-NEXT: v_add_f16_e32 v33, s29, v1
-; VI-NEXT: v_add_f16_e32 v25, s6, v1
+; VI-NEXT: v_add_f16_e32 v25, s7, v1
+; VI-NEXT: v_add_f16_e32 v49, s6, v1
; VI-NEXT: s_lshr_b32 s6, s5, 16
; VI-NEXT: v_or_b32_e32 v55, v33, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v25
-; VI-NEXT: v_add_f16_e32 v49, s28, v1
; VI-NEXT: v_add_f16_e32 v11, s6, v1
; VI-NEXT: v_add_f16_e32 v34, s5, v1
; VI-NEXT: s_lshr_b32 s5, s4, 16
; VI-NEXT: v_or_b32_e32 v54, v49, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v11
; VI-NEXT: v_add_f16_e32 v26, s5, v1
-; VI-NEXT: v_or_b32_e32 v52, v34, v2
+; VI-NEXT: v_or_b32_e32 v41, v34, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v26
; VI-NEXT: v_add_f16_e32 v50, s4, v1
-; VI-NEXT: v_or_b32_e32 v51, v50, v2
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[51:52]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v40, v50, v2
+; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[40:41]
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[2:3], 24, v[54:55]
; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[42:43]
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[54:55]
; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5
; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[5:6]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v41
; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v42
; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v6
-; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[46:47]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7
+; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[56:57]
+; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v7
; VI-NEXT: v_lshrrev_b64 v[6:7], 24, v[7:8]
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v52
-; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v47
-; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v8
-; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[61:62]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v56
+; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v8
+; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[51:52]
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v40
+; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v51
+; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v9
; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[9:10]
-; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v51
-; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v55
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v55
; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v54
; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v43
-; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v46
-; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v62
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v61
-; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v10
+; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v57
+; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v52
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v10
; VI-NEXT: v_bfe_u32 v9, v11, 8, 8
; VI-NEXT: v_bfe_u32 v10, v18, 8, 8
; VI-NEXT: v_bfe_u32 v40, v17, 8, 8
; VI-NEXT: v_bfe_u32 v43, v16, 8, 8
; VI-NEXT: v_bfe_u32 v46, v15, 8, 8
; VI-NEXT: v_bfe_u32 v57, v14, 8, 8
-; VI-NEXT: v_bfe_u32 v59, v13, 8, 8
+; VI-NEXT: v_bfe_u32 v60, v13, 8, 8
; VI-NEXT: v_bfe_u32 v62, v12, 8, 8
; VI-NEXT: s_branch .LBB105_5
; VI-NEXT: .LBB105_3:
-; VI-NEXT: ; implicit-def: $sgpr73
-; VI-NEXT: ; implicit-def: $sgpr67
-; VI-NEXT: ; implicit-def: $sgpr6
; VI-NEXT: ; implicit-def: $sgpr30
+; VI-NEXT: ; implicit-def: $sgpr67
+; VI-NEXT: ; implicit-def: $sgpr20
+; VI-NEXT: ; implicit-def: $sgpr91
; VI-NEXT: ; implicit-def: $sgpr66
; VI-NEXT: ; implicit-def: $sgpr35
-; VI-NEXT: ; implicit-def: $sgpr72
+; VI-NEXT: ; implicit-def: $sgpr89
; VI-NEXT: ; implicit-def: $sgpr65
-; VI-NEXT: ; implicit-def: $sgpr8
-; VI-NEXT: ; implicit-def: $sgpr90
+; VI-NEXT: ; implicit-def: $sgpr22
+; VI-NEXT: ; implicit-def: $sgpr88
; VI-NEXT: ; implicit-def: $sgpr64
; VI-NEXT: ; implicit-def: $sgpr34
-; VI-NEXT: ; implicit-def: $sgpr63
+; VI-NEXT: ; implicit-def: $sgpr78
; VI-NEXT: ; implicit-def: $sgpr55
-; VI-NEXT: ; implicit-def: $sgpr10
-; VI-NEXT: ; implicit-def: $sgpr88
+; VI-NEXT: ; implicit-def: $sgpr24
+; VI-NEXT: ; implicit-def: $sgpr76
; VI-NEXT: ; implicit-def: $sgpr54
; VI-NEXT: ; implicit-def: $sgpr31
-; VI-NEXT: ; implicit-def: $sgpr61
+; VI-NEXT: ; implicit-def: $sgpr74
; VI-NEXT: ; implicit-def: $sgpr53
-; VI-NEXT: ; implicit-def: $sgpr12
-; VI-NEXT: ; implicit-def: $sgpr78
+; VI-NEXT: ; implicit-def: $sgpr26
+; VI-NEXT: ; implicit-def: $sgpr73
; VI-NEXT: ; implicit-def: $sgpr52
-; VI-NEXT: ; implicit-def: $sgpr91
-; VI-NEXT: ; implicit-def: $sgpr60
+; VI-NEXT: ; implicit-def: $sgpr90
+; VI-NEXT: ; implicit-def: $sgpr63
; VI-NEXT: ; implicit-def: $sgpr51
-; VI-NEXT: ; implicit-def: $sgpr14
-; VI-NEXT: ; implicit-def: $sgpr76
+; VI-NEXT: ; implicit-def: $sgpr28
+; VI-NEXT: ; implicit-def: $sgpr62
; VI-NEXT: ; implicit-def: $sgpr50
-; VI-NEXT: ; implicit-def: $sgpr89
-; VI-NEXT: ; implicit-def: $sgpr59
+; VI-NEXT: ; implicit-def: $sgpr79
+; VI-NEXT: ; implicit-def: $sgpr61
; VI-NEXT: ; implicit-def: $sgpr49
; VI-NEXT: ; implicit-def: $sgpr40
-; VI-NEXT: ; implicit-def: $sgpr74
+; VI-NEXT: ; implicit-def: $sgpr60
; VI-NEXT: ; implicit-def: $sgpr48
-; VI-NEXT: ; implicit-def: $sgpr79
-; VI-NEXT: ; implicit-def: $sgpr57
+; VI-NEXT: ; implicit-def: $sgpr77
+; VI-NEXT: ; implicit-def: $sgpr59
; VI-NEXT: ; implicit-def: $sgpr39
; VI-NEXT: ; implicit-def: $sgpr42
-; VI-NEXT: ; implicit-def: $sgpr62
+; VI-NEXT: ; implicit-def: $sgpr58
; VI-NEXT: ; implicit-def: $sgpr38
-; VI-NEXT: ; implicit-def: $sgpr77
-; VI-NEXT: ; implicit-def: $sgpr56
+; VI-NEXT: ; implicit-def: $sgpr75
+; VI-NEXT: ; implicit-def: $sgpr57
; VI-NEXT: ; implicit-def: $sgpr37
; VI-NEXT: ; implicit-def: $sgpr44
-; VI-NEXT: ; implicit-def: $sgpr58
+; VI-NEXT: ; implicit-def: $sgpr56
; VI-NEXT: ; implicit-def: $sgpr36
-; VI-NEXT: ; implicit-def: $sgpr75
+; VI-NEXT: ; implicit-def: $sgpr72
; VI-NEXT: s_branch .LBB105_2
; VI-NEXT: .LBB105_4:
-; VI-NEXT: v_mov_b32_e32 v53, s56
-; VI-NEXT: v_mov_b32_e32 v52, s42
-; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v2, s57
+; VI-NEXT: v_mov_b32_e32 v53, s58
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v2, s56
; VI-NEXT: v_mov_b32_e32 v52, s44
-; VI-NEXT: v_mov_b32_e32 v1, s58
; VI-NEXT: v_mov_b32_e32 v19, s67
; VI-NEXT: v_mov_b32_e32 v12, s66
; VI-NEXT: v_mov_b32_e32 v20, s65
@@ -79148,96 +79914,96 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: v_mov_b32_e32 v18, s38
; VI-NEXT: v_mov_b32_e32 v26, s37
; VI-NEXT: v_mov_b32_e32 v11, s36
-; VI-NEXT: v_mov_b32_e32 v35, s16
-; VI-NEXT: v_mov_b32_e32 v27, s17
-; VI-NEXT: v_mov_b32_e32 v36, s18
-; VI-NEXT: v_mov_b32_e32 v28, s19
-; VI-NEXT: v_mov_b32_e32 v37, s20
-; VI-NEXT: v_mov_b32_e32 v29, s21
-; VI-NEXT: v_mov_b32_e32 v38, s22
-; VI-NEXT: v_mov_b32_e32 v30, s23
-; VI-NEXT: v_mov_b32_e32 v39, s24
-; VI-NEXT: v_mov_b32_e32 v31, s25
-; VI-NEXT: v_mov_b32_e32 v48, s26
-; VI-NEXT: v_mov_b32_e32 v32, s27
-; VI-NEXT: v_mov_b32_e32 v49, s28
-; VI-NEXT: v_mov_b32_e32 v33, s29
+; VI-NEXT: v_mov_b32_e32 v35, s18
+; VI-NEXT: v_mov_b32_e32 v27, s19
+; VI-NEXT: v_mov_b32_e32 v36, s16
+; VI-NEXT: v_mov_b32_e32 v28, s17
+; VI-NEXT: v_mov_b32_e32 v37, s14
+; VI-NEXT: v_mov_b32_e32 v29, s15
+; VI-NEXT: v_mov_b32_e32 v38, s12
+; VI-NEXT: v_mov_b32_e32 v30, s13
+; VI-NEXT: v_mov_b32_e32 v39, s10
+; VI-NEXT: v_mov_b32_e32 v31, s11
+; VI-NEXT: v_mov_b32_e32 v48, s8
+; VI-NEXT: v_mov_b32_e32 v32, s9
+; VI-NEXT: v_mov_b32_e32 v49, s6
+; VI-NEXT: v_mov_b32_e32 v33, s7
; VI-NEXT: v_mov_b32_e32 v50, s4
; VI-NEXT: v_mov_b32_e32 v34, s5
; VI-NEXT: v_mov_b32_e32 v62, s35
-; VI-NEXT: v_mov_b32_e32 v59, s34
+; VI-NEXT: v_mov_b32_e32 v60, s34
; VI-NEXT: v_mov_b32_e32 v57, s31
-; VI-NEXT: v_mov_b32_e32 v46, s91
-; VI-NEXT: v_mov_b32_e32 v43, s89
-; VI-NEXT: v_mov_b32_e32 v40, s79
-; VI-NEXT: v_mov_b32_e32 v10, s77
-; VI-NEXT: v_mov_b32_e32 v61, s30
-; VI-NEXT: v_mov_b32_e32 v58, s90
-; VI-NEXT: v_mov_b32_e32 v47, s88
-; VI-NEXT: v_mov_b32_e32 v45, s78
-; VI-NEXT: v_mov_b32_e32 v42, s76
-; VI-NEXT: v_mov_b32_e32 v55, s74
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v54, s57
-; VI-NEXT: v_mov_b32_e32 v41, s59
-; VI-NEXT: v_mov_b32_e32 v44, s60
-; VI-NEXT: v_mov_b32_e32 v56, s61
-; VI-NEXT: v_mov_b32_e32 v60, s63
-; VI-NEXT: v_mov_b32_e32 v51, s72
-; VI-NEXT: v_mov_b32_e32 v1, s73
-; VI-NEXT: v_mov_b32_e32 v8, s6
-; VI-NEXT: v_mov_b32_e32 v7, s8
-; VI-NEXT: v_mov_b32_e32 v6, s10
-; VI-NEXT: v_mov_b32_e32 v5, s12
-; VI-NEXT: v_mov_b32_e32 v4, s14
+; VI-NEXT: v_mov_b32_e32 v46, s90
+; VI-NEXT: v_mov_b32_e32 v43, s79
+; VI-NEXT: v_mov_b32_e32 v40, s77
+; VI-NEXT: v_mov_b32_e32 v10, s75
+; VI-NEXT: v_mov_b32_e32 v9, s72
+; VI-NEXT: v_mov_b32_e32 v51, s30
+; VI-NEXT: v_mov_b32_e32 v1, s91
+; VI-NEXT: v_mov_b32_e32 v61, s89
+; VI-NEXT: v_mov_b32_e32 v59, s88
+; VI-NEXT: v_mov_b32_e32 v58, s78
+; VI-NEXT: v_mov_b32_e32 v56, s76
+; VI-NEXT: v_mov_b32_e32 v47, s74
+; VI-NEXT: v_mov_b32_e32 v45, s73
+; VI-NEXT: v_mov_b32_e32 v44, s63
+; VI-NEXT: v_mov_b32_e32 v42, s62
+; VI-NEXT: v_mov_b32_e32 v41, s61
+; VI-NEXT: v_mov_b32_e32 v55, s60
+; VI-NEXT: v_mov_b32_e32 v54, s59
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v8, s20
+; VI-NEXT: v_mov_b32_e32 v7, s22
+; VI-NEXT: v_mov_b32_e32 v6, s24
+; VI-NEXT: v_mov_b32_e32 v5, s26
+; VI-NEXT: v_mov_b32_e32 v4, s28
; VI-NEXT: v_mov_b32_e32 v3, s40
-; VI-NEXT: v_mov_b32_e32 v9, s75
-; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v52, s62
+; VI-NEXT: v_mov_b32_e32 v2, s42
+; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: .LBB105_5: ; %end
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v51
; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v35, v35, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61
+; VI-NEXT: v_or_b32_sdwa v8, v35, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v62
; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v12, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0
; VI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v51
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61
; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v20, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v7, vcc, 8, v0
; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v58
-; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v59
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v59
+; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v60
; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v13, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v7, vcc, 12, v0
; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v58
; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v21, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v6, vcc, 16, v0
; VI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v47
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v56
; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v57
; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v6, vcc, 20, v0
; VI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v56
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v47
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -79279,25 +80045,23 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v3, vcc, 44, v0
; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v54
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10
+; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: v_readlane_b32 s67, v63, 19
; VI-NEXT: v_readlane_b32 s66, v63, 18
; VI-NEXT: v_readlane_b32 s65, v63, 17
@@ -79318,30 +80082,16 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: v_readlane_b32 s34, v63, 2
; VI-NEXT: v_readlane_b32 s31, v63, 1
; VI-NEXT: v_readlane_b32 s30, v63, 0
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10
-; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53
-; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v9
; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
@@ -79350,8 +80100,23 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -79377,10 +80142,38 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX9-NEXT: v_writelane_b32 v63, s52, 12
; GFX9-NEXT: v_writelane_b32 v63, s53, 13
; GFX9-NEXT: v_writelane_b32 v63, s54, 14
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: v_mov_b32_e32 v5, s17
+; GFX9-NEXT: v_mov_b32_e32 v6, s18
+; GFX9-NEXT: v_mov_b32_e32 v7, s19
+; GFX9-NEXT: v_mov_b32_e32 v8, s20
+; GFX9-NEXT: v_mov_b32_e32 v9, s21
+; GFX9-NEXT: v_mov_b32_e32 v10, s22
+; GFX9-NEXT: v_mov_b32_e32 v11, s23
+; GFX9-NEXT: v_mov_b32_e32 v12, s24
+; GFX9-NEXT: v_mov_b32_e32 v13, s25
+; GFX9-NEXT: v_mov_b32_e32 v14, s26
+; GFX9-NEXT: v_mov_b32_e32 v15, s27
+; GFX9-NEXT: v_mov_b32_e32 v16, s28
+; GFX9-NEXT: v_mov_b32_e32 v17, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; GFX9-NEXT: v_writelane_b32 v63, s55, 15
+; GFX9-NEXT: v_readfirstlane_b32 s18, v4
+; GFX9-NEXT: v_readfirstlane_b32 s19, v5
+; GFX9-NEXT: v_readfirstlane_b32 s16, v6
+; GFX9-NEXT: v_readfirstlane_b32 s17, v7
+; GFX9-NEXT: v_readfirstlane_b32 s14, v8
+; GFX9-NEXT: v_readfirstlane_b32 s15, v9
+; GFX9-NEXT: v_readfirstlane_b32 s12, v10
+; GFX9-NEXT: v_readfirstlane_b32 s13, v11
+; GFX9-NEXT: v_readfirstlane_b32 s10, v12
+; GFX9-NEXT: v_readfirstlane_b32 s11, v13
+; GFX9-NEXT: v_readfirstlane_b32 s8, v14
+; GFX9-NEXT: v_readfirstlane_b32 s9, v15
+; GFX9-NEXT: v_readfirstlane_b32 s6, v16
+; GFX9-NEXT: v_readfirstlane_b32 s7, v17
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -79404,66 +80197,66 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX9-NEXT: s_lshr_b32 s59, s5, 8
; GFX9-NEXT: s_lshr_b32 s58, s4, 16
; GFX9-NEXT: s_lshr_b32 s60, s4, 8
-; GFX9-NEXT: s_lshr_b32 s61, s29, 24
-; GFX9-NEXT: s_lshr_b32 s62, s29, 16
-; GFX9-NEXT: s_lshr_b32 s72, s29, 8
-; GFX9-NEXT: s_lshr_b32 s63, s28, 16
-; GFX9-NEXT: s_lshr_b32 s73, s28, 8
-; GFX9-NEXT: s_lshr_b32 s74, s27, 24
-; GFX9-NEXT: s_lshr_b32 s75, s27, 16
-; GFX9-NEXT: s_lshr_b32 s77, s27, 8
-; GFX9-NEXT: s_lshr_b32 s76, s26, 16
-; GFX9-NEXT: s_lshr_b32 s78, s26, 8
-; GFX9-NEXT: s_lshr_b32 s79, s25, 24
-; GFX9-NEXT: s_lshr_b32 s88, s25, 16
-; GFX9-NEXT: s_lshr_b32 s90, s25, 8
-; GFX9-NEXT: s_lshr_b32 s89, s24, 16
-; GFX9-NEXT: s_lshr_b32 s91, s24, 8
-; GFX9-NEXT: s_lshr_b32 s92, s23, 24
-; GFX9-NEXT: s_lshr_b32 s93, s23, 16
-; GFX9-NEXT: s_lshr_b32 s95, s23, 8
-; GFX9-NEXT: s_lshr_b32 s94, s22, 16
-; GFX9-NEXT: s_lshr_b32 s30, s22, 8
-; GFX9-NEXT: s_lshr_b32 s31, s21, 24
-; GFX9-NEXT: s_lshr_b32 s34, s21, 16
-; GFX9-NEXT: s_lshr_b32 s36, s21, 8
-; GFX9-NEXT: s_lshr_b32 s35, s20, 16
-; GFX9-NEXT: s_lshr_b32 s37, s20, 8
-; GFX9-NEXT: s_lshr_b32 s38, s19, 24
-; GFX9-NEXT: s_lshr_b32 s39, s19, 16
-; GFX9-NEXT: s_lshr_b32 s49, s19, 8
-; GFX9-NEXT: s_lshr_b32 s48, s18, 16
-; GFX9-NEXT: s_lshr_b32 s50, s18, 8
-; GFX9-NEXT: s_lshr_b32 s51, s17, 24
-; GFX9-NEXT: s_lshr_b32 s52, s17, 16
-; GFX9-NEXT: s_lshr_b32 s54, s17, 8
-; GFX9-NEXT: s_lshr_b32 s53, s16, 16
-; GFX9-NEXT: s_lshr_b32 s55, s16, 8
+; GFX9-NEXT: s_lshr_b32 s61, s7, 24
+; GFX9-NEXT: s_lshr_b32 s62, s7, 16
+; GFX9-NEXT: s_lshr_b32 s72, s7, 8
+; GFX9-NEXT: s_lshr_b32 s63, s6, 16
+; GFX9-NEXT: s_lshr_b32 s73, s6, 8
+; GFX9-NEXT: s_lshr_b32 s74, s9, 24
+; GFX9-NEXT: s_lshr_b32 s75, s9, 16
+; GFX9-NEXT: s_lshr_b32 s77, s9, 8
+; GFX9-NEXT: s_lshr_b32 s76, s8, 16
+; GFX9-NEXT: s_lshr_b32 s78, s8, 8
+; GFX9-NEXT: s_lshr_b32 s79, s11, 24
+; GFX9-NEXT: s_lshr_b32 s88, s11, 16
+; GFX9-NEXT: s_lshr_b32 s90, s11, 8
+; GFX9-NEXT: s_lshr_b32 s89, s10, 16
+; GFX9-NEXT: s_lshr_b32 s91, s10, 8
+; GFX9-NEXT: s_lshr_b32 s92, s13, 24
+; GFX9-NEXT: s_lshr_b32 s93, s13, 16
+; GFX9-NEXT: s_lshr_b32 s95, s13, 8
+; GFX9-NEXT: s_lshr_b32 s94, s12, 16
+; GFX9-NEXT: s_lshr_b32 s30, s12, 8
+; GFX9-NEXT: s_lshr_b32 s31, s15, 24
+; GFX9-NEXT: s_lshr_b32 s34, s15, 16
+; GFX9-NEXT: s_lshr_b32 s36, s15, 8
+; GFX9-NEXT: s_lshr_b32 s35, s14, 16
+; GFX9-NEXT: s_lshr_b32 s37, s14, 8
+; GFX9-NEXT: s_lshr_b32 s38, s17, 24
+; GFX9-NEXT: s_lshr_b32 s39, s17, 16
+; GFX9-NEXT: s_lshr_b32 s49, s17, 8
+; GFX9-NEXT: s_lshr_b32 s48, s16, 16
+; GFX9-NEXT: s_lshr_b32 s50, s16, 8
+; GFX9-NEXT: s_lshr_b32 s51, s19, 24
+; GFX9-NEXT: s_lshr_b32 s52, s19, 16
+; GFX9-NEXT: s_lshr_b32 s54, s19, 8
+; GFX9-NEXT: s_lshr_b32 s53, s18, 16
+; GFX9-NEXT: s_lshr_b32 s55, s18, 8
; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24
-; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24
-; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24
-; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24
-; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[42:43], s[6:7], 24
+; GFX9-NEXT: s_lshr_b64 s[40:41], s[8:9], 24
+; GFX9-NEXT: s_lshr_b64 s[28:29], s[10:11], 24
+; GFX9-NEXT: s_lshr_b64 s[26:27], s[12:13], 24
+; GFX9-NEXT: s_lshr_b64 s[24:25], s[14:15], 24
+; GFX9-NEXT: s_lshr_b64 s[22:23], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[20:21], s[18:19], 24
; GFX9-NEXT: s_cbranch_execnz .LBB105_4
; GFX9-NEXT: .LBB105_2: ; %cmp.true
; GFX9-NEXT: v_mov_b32_e32 v1, 0x200
-; GFX9-NEXT: v_pk_add_f16 v20, s17, v1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v19, s16, v1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v16, s19, v1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v15, s18, v1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v12, s21, v1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v11, s20, v1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v10, s23, v1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v9, s22, v1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v8, s25, v1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v7, s24, v1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v6, s27, v1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v5, s26, v1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v4, s29, v1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v3, s28, v1 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v20, s19, v1 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v19, s18, v1 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v16, s17, v1 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v15, s16, v1 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v12, s15, v1 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v11, s14, v1 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v10, s13, v1 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v9, s12, v1 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v8, s11, v1 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v7, s10, v1 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v6, s9, v1 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v5, s8, v1 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v4, s7, v1 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v3, s6, v1 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v2, s5, v1 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v1, s4, v1 op_sel_hi:[1,0]
; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2]
@@ -79524,31 +80317,31 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX9-NEXT: .LBB105_3:
; GFX9-NEXT: ; implicit-def: $sgpr55
; GFX9-NEXT: ; implicit-def: $sgpr53
-; GFX9-NEXT: ; implicit-def: $sgpr6
+; GFX9-NEXT: ; implicit-def: $sgpr20
; GFX9-NEXT: ; implicit-def: $sgpr54
; GFX9-NEXT: ; implicit-def: $sgpr52
; GFX9-NEXT: ; implicit-def: $sgpr51
; GFX9-NEXT: ; implicit-def: $sgpr50
; GFX9-NEXT: ; implicit-def: $sgpr48
-; GFX9-NEXT: ; implicit-def: $sgpr8
+; GFX9-NEXT: ; implicit-def: $sgpr22
; GFX9-NEXT: ; implicit-def: $sgpr49
; GFX9-NEXT: ; implicit-def: $sgpr39
; GFX9-NEXT: ; implicit-def: $sgpr38
; GFX9-NEXT: ; implicit-def: $sgpr37
; GFX9-NEXT: ; implicit-def: $sgpr35
-; GFX9-NEXT: ; implicit-def: $sgpr10
+; GFX9-NEXT: ; implicit-def: $sgpr24
; GFX9-NEXT: ; implicit-def: $sgpr36
; GFX9-NEXT: ; implicit-def: $sgpr34
; GFX9-NEXT: ; implicit-def: $sgpr31
; GFX9-NEXT: ; implicit-def: $sgpr30
; GFX9-NEXT: ; implicit-def: $sgpr94
-; GFX9-NEXT: ; implicit-def: $sgpr12
+; GFX9-NEXT: ; implicit-def: $sgpr26
; GFX9-NEXT: ; implicit-def: $sgpr95
; GFX9-NEXT: ; implicit-def: $sgpr93
; GFX9-NEXT: ; implicit-def: $sgpr92
; GFX9-NEXT: ; implicit-def: $sgpr91
; GFX9-NEXT: ; implicit-def: $sgpr89
-; GFX9-NEXT: ; implicit-def: $sgpr14
+; GFX9-NEXT: ; implicit-def: $sgpr28
; GFX9-NEXT: ; implicit-def: $sgpr90
; GFX9-NEXT: ; implicit-def: $sgpr88
; GFX9-NEXT: ; implicit-def: $sgpr79
@@ -79577,20 +80370,20 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v21, s42
-; GFX9-NEXT: v_mov_b32_e32 v19, s16
-; GFX9-NEXT: v_mov_b32_e32 v20, s17
-; GFX9-NEXT: v_mov_b32_e32 v15, s18
-; GFX9-NEXT: v_mov_b32_e32 v16, s19
-; GFX9-NEXT: v_mov_b32_e32 v11, s20
-; GFX9-NEXT: v_mov_b32_e32 v12, s21
-; GFX9-NEXT: v_mov_b32_e32 v9, s22
-; GFX9-NEXT: v_mov_b32_e32 v10, s23
-; GFX9-NEXT: v_mov_b32_e32 v7, s24
-; GFX9-NEXT: v_mov_b32_e32 v8, s25
-; GFX9-NEXT: v_mov_b32_e32 v5, s26
-; GFX9-NEXT: v_mov_b32_e32 v6, s27
-; GFX9-NEXT: v_mov_b32_e32 v3, s28
-; GFX9-NEXT: v_mov_b32_e32 v4, s29
+; GFX9-NEXT: v_mov_b32_e32 v19, s18
+; GFX9-NEXT: v_mov_b32_e32 v20, s19
+; GFX9-NEXT: v_mov_b32_e32 v15, s16
+; GFX9-NEXT: v_mov_b32_e32 v16, s17
+; GFX9-NEXT: v_mov_b32_e32 v11, s14
+; GFX9-NEXT: v_mov_b32_e32 v12, s15
+; GFX9-NEXT: v_mov_b32_e32 v9, s12
+; GFX9-NEXT: v_mov_b32_e32 v10, s13
+; GFX9-NEXT: v_mov_b32_e32 v7, s10
+; GFX9-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NEXT: v_mov_b32_e32 v5, s8
+; GFX9-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_mov_b32_e32 v17, s55
@@ -79633,15 +80426,15 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX9-NEXT: v_mov_b32_e32 v27, s59
; GFX9-NEXT: v_mov_b32_e32 v14, s57
; GFX9-NEXT: v_mov_b32_e32 v18, s56
-; GFX9-NEXT: v_mov_b32_e32 v23, s12
-; GFX9-NEXT: v_mov_b32_e32 v24, s10
-; GFX9-NEXT: v_mov_b32_e32 v25, s8
-; GFX9-NEXT: v_mov_b32_e32 v26, s6
+; GFX9-NEXT: v_mov_b32_e32 v23, s26
+; GFX9-NEXT: v_mov_b32_e32 v24, s24
+; GFX9-NEXT: v_mov_b32_e32 v25, s22
+; GFX9-NEXT: v_mov_b32_e32 v26, s20
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v21, s40
-; GFX9-NEXT: v_mov_b32_e32 v22, s14
+; GFX9-NEXT: v_mov_b32_e32 v22, s28
; GFX9-NEXT: .LBB105_5: ; %end
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -79719,21 +80512,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s55, v63, 15
; GFX9-NEXT: v_readlane_b32 s54, v63, 14
; GFX9-NEXT: v_readlane_b32 s53, v63, 13
@@ -79750,7 +80528,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX9-NEXT: v_readlane_b32 s34, v63, 2
; GFX9-NEXT: v_readlane_b32 s31, v63, 1
; GFX9-NEXT: v_readlane_b32 s30, v63, 0
-; GFX9-NEXT: s_waitcnt vmcnt(16)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -79776,6 +80554,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -85177,24 +85970,8 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
@@ -85222,6 +85999,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -85777,22 +86570,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18
@@ -85826,6 +86603,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -86357,10 +87150,21 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17
+; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54
+; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -86377,17 +87181,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54
-; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -87519,1348 +88312,1564 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-LABEL: bitcast_v32bf16_to_v64i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: s_mov_b64 exec, s[4:5]
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_writelane_b32 v40, s30, 0
+; SI-NEXT: v_writelane_b32 v40, s31, 1
+; SI-NEXT: v_writelane_b32 v40, s34, 2
+; SI-NEXT: v_writelane_b32 v40, s35, 3
+; SI-NEXT: v_writelane_b32 v40, s36, 4
+; SI-NEXT: v_writelane_b32 v40, s37, 5
+; SI-NEXT: v_writelane_b32 v40, s38, 6
+; SI-NEXT: v_writelane_b32 v40, s39, 7
+; SI-NEXT: v_writelane_b32 v40, s48, 8
+; SI-NEXT: v_writelane_b32 v40, s49, 9
+; SI-NEXT: v_writelane_b32 v40, s50, 10
+; SI-NEXT: v_writelane_b32 v40, s51, 11
+; SI-NEXT: v_writelane_b32 v40, s52, 12
+; SI-NEXT: v_writelane_b32 v40, s53, 13
+; SI-NEXT: v_writelane_b32 v40, s54, 14
+; SI-NEXT: v_writelane_b32 v40, s55, 15
+; SI-NEXT: v_writelane_b32 v40, s64, 16
+; SI-NEXT: v_writelane_b32 v40, s65, 17
+; SI-NEXT: v_writelane_b32 v40, s66, 18
+; SI-NEXT: v_writelane_b32 v40, s67, 19
+; SI-NEXT: v_writelane_b32 v40, s68, 20
+; SI-NEXT: v_writelane_b32 v40, s69, 21
+; SI-NEXT: v_writelane_b32 v40, s70, 22
+; SI-NEXT: v_writelane_b32 v40, s71, 23
+; SI-NEXT: v_writelane_b32 v40, s80, 24
+; SI-NEXT: v_writelane_b32 v40, s81, 25
+; SI-NEXT: v_writelane_b32 v40, s82, 26
+; SI-NEXT: v_writelane_b32 v40, s83, 27
+; SI-NEXT: v_writelane_b32 v40, s84, 28
+; SI-NEXT: v_writelane_b32 v40, s85, 29
+; SI-NEXT: v_writelane_b32 v40, s86, 30
+; SI-NEXT: v_writelane_b32 v40, s87, 31
+; SI-NEXT: v_writelane_b32 v40, s96, 32
+; SI-NEXT: v_writelane_b32 v40, s97, 33
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_writelane_b32 v40, s98, 34
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mul_f32_e64 v19, 1.0, s17
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v9
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v60, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v28, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v26, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v53, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v55, 1.0, s28
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v11, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v13, 1.0, s28
+; SI-NEXT: v_writelane_b32 v40, s99, 35
+; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB109_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
-; SI-NEXT: v_alignbit_b32 v23, v1, v3, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT: v_alignbit_b32 v20, v1, v6, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; SI-NEXT: v_alignbit_b32 v17, v1, v38, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53
-; SI-NEXT: v_alignbit_b32 v14, v1, v55, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50
-; SI-NEXT: v_alignbit_b32 v11, v1, v52, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v1, v46, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61
-; SI-NEXT: v_alignbit_b32 v21, v19, v4, 16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v35
-; SI-NEXT: v_alignbit_b32 v4, v1, v25, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v26
-; SI-NEXT: v_alignbit_b32 v18, v16, v7, 16
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v39
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v40
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v57
-; SI-NEXT: v_alignbit_b32 v3, v1, v37, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32
-; SI-NEXT: v_alignbit_b32 v24, v22, v2, 16
-; SI-NEXT: v_alignbit_b32 v15, v13, v27, 16
-; SI-NEXT: v_alignbit_b32 v12, v10, v49, 16
-; SI-NEXT: v_alignbit_b32 v9, v7, v43, 16
-; SI-NEXT: v_alignbit_b32 v5, v6, v60, 16
-; SI-NEXT: v_alignbit_b32 v2, v1, v34, 16
-; SI-NEXT: v_readfirstlane_b32 s8, v23
-; SI-NEXT: v_readfirstlane_b32 s9, v24
-; SI-NEXT: v_readfirstlane_b32 s14, v20
-; SI-NEXT: v_readfirstlane_b32 s15, v21
-; SI-NEXT: v_readfirstlane_b32 s20, v17
-; SI-NEXT: v_readfirstlane_b32 s21, v18
-; SI-NEXT: v_readfirstlane_b32 s26, v14
-; SI-NEXT: v_readfirstlane_b32 s27, v15
-; SI-NEXT: v_readfirstlane_b32 s42, v11
-; SI-NEXT: v_readfirstlane_b32 s43, v12
-; SI-NEXT: v_readfirstlane_b32 s56, v8
-; SI-NEXT: v_readfirstlane_b32 s57, v9
-; SI-NEXT: v_readfirstlane_b32 s62, v4
-; SI-NEXT: v_readfirstlane_b32 s63, v5
-; SI-NEXT: v_readfirstlane_b32 s76, v3
-; SI-NEXT: v_readfirstlane_b32 s77, v2
-; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24
-; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16
-; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8
-; SI-NEXT: s_lshr_b64 s[8:9], s[14:15], 24
-; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16
-; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8
-; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 24
-; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16
-; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8
-; SI-NEXT: s_lshr_b64 s[20:21], s[26:27], 24
-; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16
-; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8
-; SI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24
-; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16
-; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8
-; SI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24
-; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16
-; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8
-; SI-NEXT: s_lshr_b64 s[56:57], s[62:63], 24
-; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16
-; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8
-; SI-NEXT: s_lshr_b64 s[62:63], s[76:77], 24
-; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16
-; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 8
-; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v26
-; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v24
-; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29
-; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v21
-; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v35
-; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v18
-; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v33
-; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v15
-; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v39
-; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v12
-; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v40
-; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v9
-; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v57
-; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v5
-; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v32
-; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2
+; SI-NEXT: v_readfirstlane_b32 s4, v19
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v3
+; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v1
+; SI-NEXT: s_lshr_b32 s73, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s72, v2
+; SI-NEXT: s_lshr_b64 s[76:77], s[72:73], 16
+; SI-NEXT: s_mov_b32 s75, s76
+; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 24
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_writelane_b32 v41, s4, 0
+; SI-NEXT: v_writelane_b32 v41, s5, 1
+; SI-NEXT: v_readfirstlane_b32 s4, v6
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v7
+; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v4
+; SI-NEXT: s_lshr_b32 s59, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v10
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v11
+; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v8
+; SI-NEXT: s_lshr_b32 s45, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v12
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v13
+; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v20
+; SI-NEXT: s_lshr_b32 s25, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v24
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v25
+; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v22
+; SI-NEXT: s_lshr_b32 s41, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v28
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v29
+; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v26
+; SI-NEXT: s_lshr_b32 s19, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v32
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v33
+; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v30
+; SI-NEXT: s_lshr_b32 s11, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v35
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v36
+; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v18
+; SI-NEXT: v_readfirstlane_b32 s58, v5
+; SI-NEXT: v_readfirstlane_b32 s44, v9
+; SI-NEXT: v_readfirstlane_b32 s24, v21
+; SI-NEXT: v_readfirstlane_b32 s40, v23
+; SI-NEXT: v_readfirstlane_b32 s18, v27
+; SI-NEXT: v_readfirstlane_b32 s10, v31
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v17
+; SI-NEXT: s_lshr_b64 s[62:63], s[58:59], 16
+; SI-NEXT: s_lshr_b64 s[56:57], s[44:45], 16
+; SI-NEXT: s_lshr_b64 s[42:43], s[24:25], 16
+; SI-NEXT: s_lshr_b64 s[22:23], s[40:41], 16
+; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 16
+; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16
+; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT: s_mov_b32 s61, s62
+; SI-NEXT: s_mov_b32 s47, s56
+; SI-NEXT: s_mov_b32 s27, s42
+; SI-NEXT: s_mov_b32 s17, s22
+; SI-NEXT: s_mov_b32 s21, s28
+; SI-NEXT: s_mov_b32 s13, s14
+; SI-NEXT: s_mov_b32 s7, s8
+; SI-NEXT: s_lshr_b64 s[88:89], s[74:75], 16
+; SI-NEXT: s_lshr_b64 s[92:93], s[74:75], 8
+; SI-NEXT: s_lshr_b64 s[90:91], s[60:61], 24
+; SI-NEXT: s_lshr_b64 s[94:95], s[60:61], 16
+; SI-NEXT: s_lshr_b64 s[30:31], s[60:61], 8
+; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 24
+; SI-NEXT: s_lshr_b64 s[36:37], s[46:47], 16
+; SI-NEXT: s_lshr_b64 s[38:39], s[46:47], 8
+; SI-NEXT: s_lshr_b64 s[48:49], s[26:27], 24
+; SI-NEXT: s_lshr_b64 s[50:51], s[26:27], 16
+; SI-NEXT: s_lshr_b64 s[52:53], s[26:27], 8
+; SI-NEXT: s_lshr_b64 s[54:55], s[16:17], 24
+; SI-NEXT: s_lshr_b64 s[64:65], s[16:17], 16
+; SI-NEXT: s_lshr_b64 s[66:67], s[16:17], 8
+; SI-NEXT: s_lshr_b64 s[68:69], s[20:21], 24
+; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 16
+; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1
+; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v4
+; SI-NEXT: s_lshr_b32 s24, s76, 8
+; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v8
+; SI-NEXT: s_lshr_b32 s23, s62, 8
+; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v20
+; SI-NEXT: s_lshr_b32 s18, s56, 8
+; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v22
+; SI-NEXT: s_lshr_b32 s17, s42, 8
+; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v26
+; SI-NEXT: s_lshr_b32 s15, s22, 8
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v30
+; SI-NEXT: s_lshr_b32 s10, s28, 8
+; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v18
+; SI-NEXT: s_lshr_b32 s9, s14, 8
+; SI-NEXT: s_lshr_b32 s4, s8, 8
+; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8
+; SI-NEXT: s_lshr_b64 s[86:87], s[12:13], 24
+; SI-NEXT: s_lshr_b64 s[96:97], s[12:13], 16
+; SI-NEXT: s_lshr_b64 s[98:99], s[12:13], 8
+; SI-NEXT: s_lshr_b64 s[80:81], s[6:7], 24
+; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 16
+; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 8
; SI-NEXT: s_cbranch_execnz .LBB109_3
; SI-NEXT: .LBB109_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v55
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v35
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v36
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v38
+; SI-NEXT: v_readfirstlane_b32 s4, v15
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v17
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v14
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v52
+; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v16
+; SI-NEXT: v_readfirstlane_b32 s10, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v31
+; SI-NEXT: s_lshr_b32 s11, s4, 16
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v15
+; SI-NEXT: s_lshr_b64 s[12:13], s[10:11], 16
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v30
+; SI-NEXT: v_readfirstlane_b32 s10, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v29
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v28
+; SI-NEXT: v_readfirstlane_b32 s4, v15
+; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT: v_readfirstlane_b32 s16, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v27
+; SI-NEXT: s_lshr_b32 s11, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v17
+; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v26
+; SI-NEXT: v_readfirstlane_b32 s18, v17
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v25
+; SI-NEXT: s_lshr_b32 s17, s4, 16
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24
+; SI-NEXT: s_lshr_b64 s[20:21], s[16:17], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v16
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT: v_readfirstlane_b32 s16, v17
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v23
+; SI-NEXT: s_lshr_b32 s19, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v18
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v17
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v22
+; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: s_lshr_b32 s17, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v17
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: s_lshr_b32 s41, s4, 16
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_readfirstlane_b32 s4, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v21
+; SI-NEXT: v_readfirstlane_b32 s24, v13
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v20
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: s_lshr_b32 s25, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v12
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16
+; SI-NEXT: s_lshr_b32 s25, s4, 16
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61
+; SI-NEXT: v_readfirstlane_b32 s4, v10
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: s_lshr_b32 s45, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s44, v11
+; SI-NEXT: v_readfirstlane_b32 s4, v8
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 16
+; SI-NEXT: s_lshr_b32 s45, s4, 16
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25
+; SI-NEXT: v_readfirstlane_b32 s4, v6
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: s_lshr_b32 s59, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s58, v7
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_alignbit_b32 v8, v8, v7, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43
-; SI-NEXT: v_alignbit_b32 v11, v11, v10, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v49
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: s_lshr_b64 s[60:61], s[58:59], 16
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v57
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v40
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v10
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v6
-; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v7
-; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32
+; SI-NEXT: v_readfirstlane_b32 s58, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v19
+; SI-NEXT: v_readfirstlane_b32 s4, v4
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v36
-; SI-NEXT: v_alignbit_b32 v2, v1, v2, 16
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: v_alignbit_b32 v9, v7, v9, 16
-; SI-NEXT: v_alignbit_b32 v12, v10, v12, 16
-; SI-NEXT: v_readfirstlane_b32 s76, v3
-; SI-NEXT: v_readfirstlane_b32 s77, v2
-; SI-NEXT: v_readfirstlane_b32 s62, v4
-; SI-NEXT: v_readfirstlane_b32 s63, v5
-; SI-NEXT: v_readfirstlane_b32 s56, v8
-; SI-NEXT: v_readfirstlane_b32 s57, v9
-; SI-NEXT: v_readfirstlane_b32 s42, v11
-; SI-NEXT: v_readfirstlane_b32 s43, v12
-; SI-NEXT: v_readfirstlane_b32 s26, v14
-; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16
-; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8
-; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16
-; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8
-; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16
-; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8
-; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16
-; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v12
-; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v9
-; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v5
-; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2
-; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v36
-; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v34
-; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v25
-; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v32
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_alignbit_b32 v17, v17, v16, 16
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: v_readfirstlane_b32 s20, v17
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_alignbit_b32 v20, v20, v19, 16
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v23, v23, v22, 16
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35
-; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v13
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v31
-; SI-NEXT: v_alignbit_b32 v15, v13, v15, 16
-; SI-NEXT: v_alignbit_b32 v18, v16, v18, 16
-; SI-NEXT: v_readfirstlane_b32 s27, v15
-; SI-NEXT: v_readfirstlane_b32 s21, v18
-; SI-NEXT: v_readfirstlane_b32 s14, v20
-; SI-NEXT: v_readfirstlane_b32 s8, v23
-; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16
-; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8
-; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16
-; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8
-; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v18
-; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v15
-; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31
-; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v33
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v19
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v22
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v19
-; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v26
-; SI-NEXT: v_alignbit_b32 v21, v19, v21, 16
-; SI-NEXT: v_alignbit_b32 v24, v22, v24, 16
-; SI-NEXT: v_readfirstlane_b32 s15, v21
-; SI-NEXT: v_readfirstlane_b32 s9, v24
-; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24
-; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16
-; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8
-; SI-NEXT: s_lshr_b64 s[8:9], s[14:15], 24
-; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16
-; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8
-; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 24
-; SI-NEXT: s_lshr_b64 s[20:21], s[26:27], 24
-; SI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24
-; SI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24
-; SI-NEXT: s_lshr_b64 s[56:57], s[62:63], 24
-; SI-NEXT: s_lshr_b64 s[62:63], s[76:77], 24
-; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 8
-; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v24
-; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v21
-; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v26
-; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_lshr_b32 s59, s4, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_readfirstlane_b32 s4, v5
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: s_lshr_b32 s73, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s72, v3
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_readfirstlane_b32 s4, v1
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: s_lshr_b64 s[74:75], s[72:73], 16
+; SI-NEXT: s_lshr_b32 s73, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s72, v2
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: s_lshr_b64 s[76:77], s[72:73], 16
+; SI-NEXT: v_readfirstlane_b32 s40, v18
+; SI-NEXT: v_readfirstlane_b32 s24, v13
+; SI-NEXT: v_readfirstlane_b32 s44, v9
+; SI-NEXT: s_mov_b32 s75, s76
+; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16
+; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 16
+; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 16
+; SI-NEXT: s_lshr_b64 s[22:23], s[40:41], 16
+; SI-NEXT: s_lshr_b64 s[42:43], s[24:25], 16
+; SI-NEXT: s_lshr_b64 s[56:57], s[44:45], 16
+; SI-NEXT: s_lshr_b64 s[62:63], s[58:59], 16
+; SI-NEXT: s_lshr_b64 s[78:79], s[74:75], 24
+; SI-NEXT: s_mov_b32 s7, s8
+; SI-NEXT: s_mov_b32 s13, s14
+; SI-NEXT: s_mov_b32 s21, s28
+; SI-NEXT: s_mov_b32 s17, s22
+; SI-NEXT: s_mov_b32 s27, s42
+; SI-NEXT: s_mov_b32 s47, s56
+; SI-NEXT: s_mov_b32 s61, s62
+; SI-NEXT: v_writelane_b32 v41, s78, 0
+; SI-NEXT: v_writelane_b32 v41, s79, 1
+; SI-NEXT: s_lshr_b64 s[88:89], s[74:75], 16
+; SI-NEXT: s_lshr_b64 s[92:93], s[74:75], 8
+; SI-NEXT: s_lshr_b64 s[90:91], s[60:61], 24
+; SI-NEXT: s_lshr_b64 s[94:95], s[60:61], 16
+; SI-NEXT: s_lshr_b64 s[30:31], s[60:61], 8
+; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 24
+; SI-NEXT: s_lshr_b64 s[36:37], s[46:47], 16
+; SI-NEXT: s_lshr_b64 s[38:39], s[46:47], 8
+; SI-NEXT: s_lshr_b64 s[48:49], s[26:27], 24
+; SI-NEXT: s_lshr_b64 s[50:51], s[26:27], 16
+; SI-NEXT: s_lshr_b64 s[52:53], s[26:27], 8
+; SI-NEXT: s_lshr_b64 s[54:55], s[16:17], 24
+; SI-NEXT: s_lshr_b64 s[64:65], s[16:17], 16
+; SI-NEXT: s_lshr_b64 s[66:67], s[16:17], 8
+; SI-NEXT: s_lshr_b64 s[68:69], s[20:21], 24
+; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 16
+; SI-NEXT: s_lshr_b32 s24, s76, 8
+; SI-NEXT: s_lshr_b32 s23, s62, 8
+; SI-NEXT: s_lshr_b32 s18, s56, 8
+; SI-NEXT: s_lshr_b32 s17, s42, 8
+; SI-NEXT: s_lshr_b32 s15, s22, 8
+; SI-NEXT: s_lshr_b32 s10, s28, 8
+; SI-NEXT: s_lshr_b32 s9, s14, 8
+; SI-NEXT: s_lshr_b32 s4, s8, 8
+; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1
+; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v4
+; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v8
+; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v12
+; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v17
+; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15
+; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v14
+; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8
+; SI-NEXT: s_lshr_b64 s[86:87], s[12:13], 24
+; SI-NEXT: s_lshr_b64 s[96:97], s[12:13], 16
+; SI-NEXT: s_lshr_b64 s[98:99], s[12:13], 8
+; SI-NEXT: s_lshr_b64 s[80:81], s[6:7], 24
+; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 16
+; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 8
; SI-NEXT: .LBB109_3: ; %end
-; SI-NEXT: v_and_b32_e32 v23, 0xff, v23
-; SI-NEXT: s_lshl_b32 s5, s10, 8
-; SI-NEXT: v_or_b32_e32 v23, s5, v23
-; SI-NEXT: s_and_b32 s5, s6, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_lshl_b32 s4, s4, 24
-; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_or_b32_e32 v23, s4, v23
-; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v23, 0xff, v24
-; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v27
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT: v_and_b32_e32 v20, 0xff, v20
-; SI-NEXT: s_lshl_b32 s4, s16, 8
-; SI-NEXT: v_or_b32_e32 v23, v23, v24
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v30
-; SI-NEXT: v_or_b32_e32 v20, s4, v20
-; SI-NEXT: s_and_b32 s4, s12, 0xff
-; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; SI-NEXT: v_or_b32_e32 v22, v24, v22
-; SI-NEXT: s_lshl_b32 s4, s4, 16
-; SI-NEXT: s_lshl_b32 s5, s8, 24
-; SI-NEXT: v_or_b32_e32 v22, v23, v22
-; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0
-; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen
-; SI-NEXT: v_or_b32_e32 v20, s4, v20
+; SI-NEXT: s_and_b32 s7, s74, 0xff
+; SI-NEXT: s_lshl_b32 s13, s92, 8
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: s_and_b32 s13, s88, 0xff
+; SI-NEXT: v_readlane_b32 s74, v41, 0
+; SI-NEXT: s_lshl_b32 s21, s74, 24
+; SI-NEXT: s_lshl_b32 s13, s13, 16
+; SI-NEXT: s_or_b32 s13, s21, s13
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: s_and_b32 s7, s76, 0xff
+; SI-NEXT: s_lshl_b32 s13, s24, 8
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: s_and_b32 s13, s73, 0xff
+; SI-NEXT: s_lshl_b32 s13, s13, 16
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v48
+; SI-NEXT: v_or_b32_e32 v2, s13, v2
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: v_or_b32_e32 v2, s7, v2
+; SI-NEXT: s_and_b32 s7, s60, 0xff
+; SI-NEXT: s_lshl_b32 s13, s30, 8
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: s_and_b32 s13, s94, 0xff
+; SI-NEXT: s_lshl_b32 s13, s13, 16
+; SI-NEXT: s_lshl_b32 s21, s90, 24
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: s_or_b32 s13, s21, s13
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0
-; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v20, 0xff, v21
-; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58
-; SI-NEXT: v_and_b32_e32 v19, 0xff, v19
-; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
-; SI-NEXT: s_lshl_b32 s4, s22, 8
-; SI-NEXT: v_or_b32_e32 v20, v20, v21
-; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v62
-; SI-NEXT: v_or_b32_e32 v17, s4, v17
-; SI-NEXT: s_and_b32 s4, s18, 0xff
-; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; SI-NEXT: v_or_b32_e32 v19, v21, v19
-; SI-NEXT: s_lshl_b32 s4, s4, 16
-; SI-NEXT: s_lshl_b32 s5, s14, 24
-; SI-NEXT: v_or_b32_e32 v19, v20, v19
-; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0
-; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT: v_or_b32_e32 v17, s4, v17
+; SI-NEXT: v_mov_b32_e32 v2, s7
+; SI-NEXT: s_and_b32 s7, s62, 0xff
+; SI-NEXT: s_lshl_b32 s13, s23, 8
+; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: s_and_b32 s13, s59, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s13, s13, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v39
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s13, v1
+; SI-NEXT: v_or_b32_e32 v1, s7, v1
+; SI-NEXT: s_and_b32 s7, s46, 0xff
+; SI-NEXT: s_lshl_b32 s13, s38, 8
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: s_and_b32 s13, s36, 0xff
+; SI-NEXT: s_lshl_b32 s13, s13, 16
+; SI-NEXT: s_lshl_b32 s21, s34, 24
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: s_or_b32 s13, s21, s13
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0
-; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s7
+; SI-NEXT: s_and_b32 s7, s56, 0xff
+; SI-NEXT: s_lshl_b32 s13, s18, 8
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v17, 0xff, v18
-; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v28
-; SI-NEXT: v_and_b32_e32 v16, 0xff, v16
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT: s_lshl_b32 s4, s28, 8
-; SI-NEXT: v_or_b32_e32 v17, v17, v18
-; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v31
-; SI-NEXT: v_or_b32_e32 v14, s4, v14
-; SI-NEXT: s_and_b32 s4, s24, 0xff
-; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; SI-NEXT: v_or_b32_e32 v16, v18, v16
-; SI-NEXT: s_lshl_b32 s4, s4, 16
-; SI-NEXT: s_lshl_b32 s5, s20, 24
-; SI-NEXT: v_or_b32_e32 v16, v17, v16
-; SI-NEXT: v_add_i32_e32 v17, vcc, 20, v0
-; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
-; SI-NEXT: v_or_b32_e32 v14, s4, v14
+; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: s_and_b32 s13, s45, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s13, s13, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v38
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s13, v1
+; SI-NEXT: v_or_b32_e32 v1, s7, v1
+; SI-NEXT: s_and_b32 s7, s26, 0xff
+; SI-NEXT: s_lshl_b32 s13, s52, 8
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: s_and_b32 s13, s50, 0xff
+; SI-NEXT: s_lshl_b32 s13, s13, 16
+; SI-NEXT: s_lshl_b32 s18, s48, 24
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: s_or_b32 s13, s18, s13
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0
-; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s7
+; SI-NEXT: s_and_b32 s7, s42, 0xff
+; SI-NEXT: s_lshl_b32 s13, s17, 8
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v15
-; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v59
-; SI-NEXT: v_and_b32_e32 v13, 0xff, v13
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT: s_lshl_b32 s4, s44, 8
-; SI-NEXT: v_or_b32_e32 v14, v14, v15
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v63
-; SI-NEXT: v_or_b32_e32 v11, s4, v11
-; SI-NEXT: s_and_b32 s4, s40, 0xff
-; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; SI-NEXT: v_or_b32_e32 v13, v15, v13
-; SI-NEXT: s_lshl_b32 s4, s4, 16
-; SI-NEXT: s_lshl_b32 s5, s26, 24
-; SI-NEXT: v_or_b32_e32 v13, v14, v13
-; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen
-; SI-NEXT: v_or_b32_e32 v11, s4, v11
+; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: s_and_b32 s13, s25, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s13, s13, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v37
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s13, v1
+; SI-NEXT: v_or_b32_e32 v1, s7, v1
+; SI-NEXT: s_and_b32 s7, s16, 0xff
+; SI-NEXT: s_lshl_b32 s13, s66, 8
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: s_and_b32 s13, s64, 0xff
+; SI-NEXT: s_lshl_b32 s13, s13, 16
+; SI-NEXT: s_lshl_b32 s16, s54, 24
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: s_or_b32 s13, s16, s13
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0
-; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s7
+; SI-NEXT: s_and_b32 s7, s22, 0xff
+; SI-NEXT: s_lshl_b32 s13, s15, 8
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v12
-; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v47
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT: s_lshl_b32 s4, s58, 8
-; SI-NEXT: v_or_b32_e32 v11, v11, v12
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v56
-; SI-NEXT: v_or_b32_e32 v8, s4, v8
-; SI-NEXT: s_and_b32 s4, s46, 0xff
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT: v_or_b32_e32 v10, v12, v10
-; SI-NEXT: s_lshl_b32 s4, s4, 16
-; SI-NEXT: s_lshl_b32 s5, s42, 24
-; SI-NEXT: v_or_b32_e32 v10, v11, v10
-; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0
-; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT: v_or_b32_e32 v8, s4, v8
+; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: s_and_b32 s13, s41, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s13, s13, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v34
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s13, v1
+; SI-NEXT: v_or_b32_e32 v1, s7, v1
+; SI-NEXT: s_and_b32 s7, s20, 0xff
+; SI-NEXT: s_lshl_b32 s13, s78, 8
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: s_and_b32 s13, s70, 0xff
+; SI-NEXT: s_lshl_b32 s13, s13, 16
+; SI-NEXT: s_lshl_b32 s15, s68, 24
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: s_or_b32 s13, s15, s13
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0
-; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
+; SI-NEXT: s_or_b32 s7, s7, s13
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s7
+; SI-NEXT: s_and_b32 s7, s28, 0xff
+; SI-NEXT: s_lshl_b32 s10, s10, 8
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v8, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v41
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT: s_lshl_b32 s4, s72, 8
-; SI-NEXT: v_or_b32_e32 v8, v8, v9
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v45
-; SI-NEXT: v_or_b32_e32 v4, s4, v4
-; SI-NEXT: s_and_b32 s4, s60, 0xff
-; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT: v_or_b32_e32 v7, v9, v7
-; SI-NEXT: s_lshl_b32 s4, s4, 16
-; SI-NEXT: s_lshl_b32 s5, s56, 24
-; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT: v_or_b32_e32 v4, s4, v4
+; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT: s_or_b32 s7, s7, s10
+; SI-NEXT: s_and_b32 s10, s19, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s10, s10, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v16
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s10, v1
+; SI-NEXT: v_or_b32_e32 v1, s7, v1
+; SI-NEXT: s_and_b32 s7, s12, 0xff
+; SI-NEXT: s_lshl_b32 s10, s98, 8
+; SI-NEXT: s_or_b32 s7, s7, s10
+; SI-NEXT: s_and_b32 s10, s96, 0xff
+; SI-NEXT: s_lshl_b32 s10, s10, 16
+; SI-NEXT: s_lshl_b32 s12, s86, 24
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: s_or_b32 s10, s12, s10
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0
-; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
+; SI-NEXT: s_or_b32 s7, s7, s10
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s7
+; SI-NEXT: s_and_b32 s7, s14, 0xff
+; SI-NEXT: s_lshl_b32 s9, s9, 8
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v54
-; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v42
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: s_lshl_b32 s4, s76, 8
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v3, s4, v3
-; SI-NEXT: s_and_b32 s4, s74, 0xff
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: s_lshl_b32 s4, s4, 16
-; SI-NEXT: s_lshl_b32 s5, s62, 24
-; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT: v_or_b32_e32 v3, s4, v3
+; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_and_b32 s9, s11, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s9, s9, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s9, v1
+; SI-NEXT: v_or_b32_e32 v1, s7, v1
+; SI-NEXT: s_and_b32 s6, s6, 0xff
+; SI-NEXT: s_lshl_b32 s7, s84, 8
+; SI-NEXT: s_or_b32 s6, s6, s7
+; SI-NEXT: s_and_b32 s7, s82, 0xff
+; SI-NEXT: s_lshl_b32 s7, s7, 16
+; SI-NEXT: s_lshl_b32 s9, s80, 24
+; SI-NEXT: s_and_b32 s6, s6, 0xffff
+; SI-NEXT: s_or_b32 s7, s9, s7
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
+; SI-NEXT: s_or_b32 s6, s6, s7
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v48
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v1, v3, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: s_and_b32 s6, s8, 0xff
+; SI-NEXT: s_lshl_b32 s4, s4, 8
+; SI-NEXT: s_and_b32 s5, s5, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_or_b32 s4, s6, s4
+; SI-NEXT: s_lshl_b32 s5, s5, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v14
+; SI-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s5, v1
+; SI-NEXT: v_or_b32_e32 v1, s4, v1
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
+; SI-NEXT: v_readlane_b32 s75, v41, 1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: v_readlane_b32 s99, v40, 35
+; SI-NEXT: v_readlane_b32 s98, v40, 34
+; SI-NEXT: v_readlane_b32 s97, v40, 33
+; SI-NEXT: v_readlane_b32 s96, v40, 32
+; SI-NEXT: v_readlane_b32 s87, v40, 31
+; SI-NEXT: v_readlane_b32 s86, v40, 30
+; SI-NEXT: v_readlane_b32 s85, v40, 29
+; SI-NEXT: v_readlane_b32 s84, v40, 28
+; SI-NEXT: v_readlane_b32 s83, v40, 27
+; SI-NEXT: v_readlane_b32 s82, v40, 26
+; SI-NEXT: v_readlane_b32 s81, v40, 25
+; SI-NEXT: v_readlane_b32 s80, v40, 24
+; SI-NEXT: v_readlane_b32 s71, v40, 23
+; SI-NEXT: v_readlane_b32 s70, v40, 22
+; SI-NEXT: v_readlane_b32 s69, v40, 21
+; SI-NEXT: v_readlane_b32 s68, v40, 20
+; SI-NEXT: v_readlane_b32 s67, v40, 19
+; SI-NEXT: v_readlane_b32 s66, v40, 18
+; SI-NEXT: v_readlane_b32 s65, v40, 17
+; SI-NEXT: v_readlane_b32 s64, v40, 16
+; SI-NEXT: v_readlane_b32 s55, v40, 15
+; SI-NEXT: v_readlane_b32 s54, v40, 14
+; SI-NEXT: v_readlane_b32 s53, v40, 13
+; SI-NEXT: v_readlane_b32 s52, v40, 12
+; SI-NEXT: v_readlane_b32 s51, v40, 11
+; SI-NEXT: v_readlane_b32 s50, v40, 10
+; SI-NEXT: v_readlane_b32 s49, v40, 9
+; SI-NEXT: v_readlane_b32 s48, v40, 8
+; SI-NEXT: v_readlane_b32 s39, v40, 7
+; SI-NEXT: v_readlane_b32 s38, v40, 6
+; SI-NEXT: v_readlane_b32 s37, v40, 5
+; SI-NEXT: v_readlane_b32 s36, v40, 4
+; SI-NEXT: v_readlane_b32 s35, v40, 3
+; SI-NEXT: v_readlane_b32 s34, v40, 2
+; SI-NEXT: v_readlane_b32 s31, v40, 1
+; SI-NEXT: v_readlane_b32 s30, v40, 0
+; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB109_4:
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $sgpr10
-; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_writelane_b32 v41, s4, 0
+; SI-NEXT: ; implicit-def: $sgpr74
+; SI-NEXT: ; implicit-def: $sgpr92
+; SI-NEXT: ; implicit-def: $sgpr88
+; SI-NEXT: v_writelane_b32 v41, s5, 1
+; SI-NEXT: ; implicit-def: $sgpr76
+; SI-NEXT: ; implicit-def: $sgpr24
+; SI-NEXT: ; implicit-def: $sgpr73
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: ; implicit-def: $sgpr30
+; SI-NEXT: ; implicit-def: $sgpr94
+; SI-NEXT: ; implicit-def: $sgpr90
+; SI-NEXT: ; implicit-def: $sgpr62
+; SI-NEXT: ; implicit-def: $sgpr23
+; SI-NEXT: ; implicit-def: $sgpr59
+; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $sgpr46
+; SI-NEXT: ; implicit-def: $sgpr38
+; SI-NEXT: ; implicit-def: $sgpr36
+; SI-NEXT: ; implicit-def: $sgpr34
+; SI-NEXT: ; implicit-def: $sgpr56
+; SI-NEXT: ; implicit-def: $sgpr18
+; SI-NEXT: ; implicit-def: $sgpr45
+; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr52
+; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: ; implicit-def: $sgpr48
+; SI-NEXT: ; implicit-def: $sgpr42
+; SI-NEXT: ; implicit-def: $sgpr17
+; SI-NEXT: ; implicit-def: $sgpr25
+; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $sgpr16
-; SI-NEXT: ; implicit-def: $sgpr12
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $sgpr66
+; SI-NEXT: ; implicit-def: $sgpr64
+; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; implicit-def: $sgpr22
-; SI-NEXT: ; implicit-def: $sgpr18
-; SI-NEXT: ; implicit-def: $sgpr14
-; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; implicit-def: $sgpr15
+; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $sgpr10
; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $sgpr9
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $sgpr28
-; SI-NEXT: ; implicit-def: $sgpr24
+; SI-NEXT: ; implicit-def: $sgpr41
; SI-NEXT: ; implicit-def: $sgpr20
-; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $sgpr44
-; SI-NEXT: ; implicit-def: $sgpr40
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $sgpr58
-; SI-NEXT: ; implicit-def: $sgpr46
-; SI-NEXT: ; implicit-def: $sgpr42
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $sgpr72
-; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr56
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $sgpr76
-; SI-NEXT: ; implicit-def: $sgpr74
-; SI-NEXT: ; implicit-def: $sgpr62
-; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $sgpr78
+; SI-NEXT: ; implicit-def: $sgpr70
+; SI-NEXT: ; implicit-def: $sgpr68
+; SI-NEXT: ; implicit-def: $sgpr28
+; SI-NEXT: ; implicit-def: $sgpr19
+; SI-NEXT: ; implicit-def: $sgpr12
+; SI-NEXT: ; implicit-def: $sgpr98
+; SI-NEXT: ; implicit-def: $sgpr96
+; SI-NEXT: ; implicit-def: $sgpr86
+; SI-NEXT: ; implicit-def: $sgpr14
+; SI-NEXT: ; implicit-def: $sgpr11
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr84
+; SI-NEXT: ; implicit-def: $sgpr82
+; SI-NEXT: ; implicit-def: $sgpr80
+; SI-NEXT: ; implicit-def: $sgpr8
+; SI-NEXT: ; implicit-def: $sgpr5
; SI-NEXT: s_branch .LBB109_2
;
; VI-LABEL: bitcast_v32bf16_to_v64i8_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v63, s30, 0
-; VI-NEXT: v_writelane_b32 v63, s31, 1
-; VI-NEXT: v_writelane_b32 v63, s34, 2
-; VI-NEXT: v_writelane_b32 v63, s35, 3
-; VI-NEXT: v_writelane_b32 v63, s36, 4
-; VI-NEXT: v_writelane_b32 v63, s37, 5
-; VI-NEXT: v_writelane_b32 v63, s38, 6
-; VI-NEXT: v_writelane_b32 v63, s39, 7
-; VI-NEXT: v_writelane_b32 v63, s48, 8
-; VI-NEXT: v_writelane_b32 v63, s49, 9
-; VI-NEXT: v_writelane_b32 v63, s50, 10
-; VI-NEXT: v_writelane_b32 v63, s51, 11
-; VI-NEXT: v_writelane_b32 v63, s52, 12
-; VI-NEXT: v_writelane_b32 v63, s53, 13
-; VI-NEXT: v_writelane_b32 v63, s54, 14
-; VI-NEXT: v_writelane_b32 v63, s55, 15
-; VI-NEXT: v_writelane_b32 v63, s64, 16
-; VI-NEXT: v_writelane_b32 v63, s65, 17
-; VI-NEXT: v_writelane_b32 v63, s66, 18
+; VI-NEXT: v_writelane_b32 v18, s30, 0
+; VI-NEXT: v_writelane_b32 v18, s31, 1
+; VI-NEXT: v_writelane_b32 v18, s34, 2
+; VI-NEXT: v_writelane_b32 v18, s35, 3
+; VI-NEXT: v_writelane_b32 v18, s36, 4
+; VI-NEXT: v_writelane_b32 v18, s37, 5
+; VI-NEXT: v_writelane_b32 v18, s38, 6
+; VI-NEXT: v_writelane_b32 v18, s39, 7
+; VI-NEXT: v_writelane_b32 v18, s48, 8
+; VI-NEXT: v_writelane_b32 v18, s49, 9
+; VI-NEXT: v_writelane_b32 v18, s50, 10
+; VI-NEXT: v_writelane_b32 v18, s51, 11
+; VI-NEXT: v_writelane_b32 v18, s52, 12
+; VI-NEXT: v_writelane_b32 v18, s53, 13
+; VI-NEXT: v_writelane_b32 v18, s54, 14
+; VI-NEXT: v_writelane_b32 v18, s55, 15
+; VI-NEXT: v_writelane_b32 v18, s64, 16
+; VI-NEXT: v_writelane_b32 v18, s65, 17
+; VI-NEXT: v_writelane_b32 v18, s66, 18
+; VI-NEXT: v_writelane_b32 v18, s67, 19
+; VI-NEXT: v_writelane_b32 v18, s68, 20
+; VI-NEXT: v_writelane_b32 v18, s69, 21
+; VI-NEXT: v_writelane_b32 v18, s70, 22
+; VI-NEXT: v_writelane_b32 v18, s71, 23
+; VI-NEXT: v_writelane_b32 v18, s80, 24
+; VI-NEXT: v_writelane_b32 v18, s81, 25
+; VI-NEXT: v_mov_b32_e32 v4, s16
+; VI-NEXT: v_mov_b32_e32 v5, s17
+; VI-NEXT: v_mov_b32_e32 v6, s18
+; VI-NEXT: v_mov_b32_e32 v7, s19
+; VI-NEXT: v_mov_b32_e32 v8, s20
+; VI-NEXT: v_mov_b32_e32 v9, s21
+; VI-NEXT: v_mov_b32_e32 v10, s22
+; VI-NEXT: v_mov_b32_e32 v11, s23
+; VI-NEXT: v_mov_b32_e32 v12, s24
+; VI-NEXT: v_mov_b32_e32 v13, s25
+; VI-NEXT: v_mov_b32_e32 v14, s26
+; VI-NEXT: v_mov_b32_e32 v15, s27
+; VI-NEXT: v_mov_b32_e32 v16, s28
+; VI-NEXT: v_mov_b32_e32 v17, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; VI-NEXT: v_writelane_b32 v63, s67, 19
+; VI-NEXT: v_writelane_b32 v18, s82, 26
+; VI-NEXT: v_readfirstlane_b32 s18, v4
+; VI-NEXT: v_readfirstlane_b32 s19, v5
+; VI-NEXT: v_readfirstlane_b32 s16, v6
+; VI-NEXT: v_readfirstlane_b32 s17, v7
+; VI-NEXT: v_readfirstlane_b32 s14, v8
+; VI-NEXT: v_readfirstlane_b32 s15, v9
+; VI-NEXT: v_readfirstlane_b32 s12, v10
+; VI-NEXT: v_readfirstlane_b32 s13, v11
+; VI-NEXT: v_readfirstlane_b32 s10, v12
+; VI-NEXT: v_readfirstlane_b32 s11, v13
+; VI-NEXT: v_readfirstlane_b32 s8, v14
+; VI-NEXT: v_readfirstlane_b32 s9, v15
+; VI-NEXT: v_readfirstlane_b32 s6, v16
+; VI-NEXT: v_readfirstlane_b32 s7, v17
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_and_b64 s[20:21], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s5, v2
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: s_cbranch_scc0 .LBB109_3
+; VI-NEXT: v_writelane_b32 v18, s83, 27
+; VI-NEXT: s_cbranch_scc0 .LBB109_4
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_lshr_b32 s56, s5, 24
-; VI-NEXT: s_lshr_b32 s57, s5, 16
-; VI-NEXT: s_lshr_b32 s59, s5, 8
-; VI-NEXT: s_lshr_b32 s58, s4, 16
-; VI-NEXT: s_lshr_b32 s60, s4, 8
-; VI-NEXT: s_lshr_b32 s61, s29, 24
-; VI-NEXT: s_lshr_b32 s62, s29, 16
-; VI-NEXT: s_lshr_b32 s72, s29, 8
-; VI-NEXT: s_lshr_b32 s63, s28, 16
-; VI-NEXT: s_lshr_b32 s73, s28, 8
-; VI-NEXT: s_lshr_b32 s74, s27, 24
-; VI-NEXT: s_lshr_b32 s75, s27, 16
-; VI-NEXT: s_lshr_b32 s77, s27, 8
-; VI-NEXT: s_lshr_b32 s76, s26, 16
-; VI-NEXT: s_lshr_b32 s78, s26, 8
-; VI-NEXT: s_lshr_b32 s79, s25, 24
-; VI-NEXT: s_lshr_b32 s88, s25, 16
-; VI-NEXT: s_lshr_b32 s90, s25, 8
-; VI-NEXT: s_lshr_b32 s89, s24, 16
-; VI-NEXT: s_lshr_b32 s91, s24, 8
-; VI-NEXT: s_lshr_b32 s30, s23, 24
-; VI-NEXT: s_lshr_b32 s31, s23, 16
-; VI-NEXT: s_lshr_b32 s35, s23, 8
-; VI-NEXT: s_lshr_b32 s34, s22, 16
-; VI-NEXT: s_lshr_b32 s36, s22, 8
-; VI-NEXT: s_lshr_b32 s37, s21, 24
-; VI-NEXT: s_lshr_b32 s38, s21, 16
-; VI-NEXT: s_lshr_b32 s48, s21, 8
-; VI-NEXT: s_lshr_b32 s39, s20, 16
-; VI-NEXT: s_lshr_b32 s49, s20, 8
-; VI-NEXT: s_lshr_b32 s50, s19, 24
-; VI-NEXT: s_lshr_b32 s51, s19, 16
-; VI-NEXT: s_lshr_b32 s53, s19, 8
-; VI-NEXT: s_lshr_b32 s52, s18, 16
-; VI-NEXT: s_lshr_b32 s54, s18, 8
-; VI-NEXT: s_lshr_b32 s55, s17, 24
-; VI-NEXT: s_lshr_b32 s64, s17, 16
-; VI-NEXT: s_lshr_b32 s66, s17, 8
-; VI-NEXT: s_lshr_b32 s65, s16, 16
-; VI-NEXT: s_lshr_b32 s67, s16, 8
-; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24
-; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24
-; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24
-; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24
-; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24
-; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24
-; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24
-; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24
-; VI-NEXT: s_cbranch_execnz .LBB109_4
+; VI-NEXT: s_lshr_b32 s21, s5, 24
+; VI-NEXT: s_lshr_b32 s23, s5, 16
+; VI-NEXT: s_lshr_b32 s25, s5, 8
+; VI-NEXT: s_lshr_b32 s27, s4, 16
+; VI-NEXT: s_lshr_b32 s29, s4, 8
+; VI-NEXT: s_lshr_b32 s41, s7, 24
+; VI-NEXT: s_lshr_b32 s47, s7, 16
+; VI-NEXT: s_lshr_b32 s57, s7, 8
+; VI-NEXT: s_lshr_b32 s88, s6, 16
+; VI-NEXT: s_lshr_b32 s89, s6, 8
+; VI-NEXT: s_lshr_b32 s90, s9, 24
+; VI-NEXT: s_lshr_b32 s91, s9, 16
+; VI-NEXT: s_lshr_b32 s30, s9, 8
+; VI-NEXT: s_lshr_b32 s31, s8, 16
+; VI-NEXT: s_lshr_b32 s34, s8, 8
+; VI-NEXT: s_lshr_b32 s35, s11, 24
+; VI-NEXT: s_lshr_b32 s36, s11, 16
+; VI-NEXT: s_lshr_b32 s37, s11, 8
+; VI-NEXT: s_lshr_b32 s38, s10, 16
+; VI-NEXT: s_lshr_b32 s39, s10, 8
+; VI-NEXT: s_lshr_b32 s48, s13, 24
+; VI-NEXT: s_lshr_b32 s49, s13, 16
+; VI-NEXT: s_lshr_b32 s50, s13, 8
+; VI-NEXT: s_lshr_b32 s51, s12, 16
+; VI-NEXT: s_lshr_b32 s52, s12, 8
+; VI-NEXT: s_lshr_b32 s53, s15, 24
+; VI-NEXT: s_lshr_b32 s54, s15, 16
+; VI-NEXT: s_lshr_b32 s55, s15, 8
+; VI-NEXT: s_lshr_b32 s64, s14, 16
+; VI-NEXT: s_lshr_b32 s65, s14, 8
+; VI-NEXT: s_lshr_b32 s66, s17, 24
+; VI-NEXT: s_lshr_b32 s67, s17, 16
+; VI-NEXT: s_lshr_b32 s68, s17, 8
+; VI-NEXT: s_lshr_b32 s69, s16, 16
+; VI-NEXT: s_lshr_b32 s70, s16, 8
+; VI-NEXT: s_lshr_b32 s71, s19, 24
+; VI-NEXT: s_lshr_b32 s80, s19, 16
+; VI-NEXT: s_lshr_b32 s81, s19, 8
+; VI-NEXT: s_lshr_b32 s82, s18, 16
+; VI-NEXT: s_lshr_b32 s83, s18, 8
+; VI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24
+; VI-NEXT: s_lshr_b64 s[44:45], s[6:7], 24
+; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24
+; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24
+; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24
+; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24
+; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24
+; VI-NEXT: s_mov_b32 s20, s19
+; VI-NEXT: s_mov_b32 s22, s17
+; VI-NEXT: s_mov_b32 s24, s15
+; VI-NEXT: s_mov_b32 s26, s13
+; VI-NEXT: s_mov_b32 s28, s11
+; VI-NEXT: s_mov_b32 s40, s9
+; VI-NEXT: s_mov_b32 s46, s7
+; VI-NEXT: s_mov_b32 s56, s5
+; VI-NEXT: s_cbranch_execnz .LBB109_3
; VI-NEXT: .LBB109_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s6, s17, 16
-; VI-NEXT: v_mov_b32_e32 v15, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s6, v15
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s6, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s6, v15
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: s_lshl_b32 s20, s19, 16
+; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v2, s20, v1
+; VI-NEXT: v_readfirstlane_b32 s20, v2
+; VI-NEXT: s_bfe_u32 s21, s20, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s20
+; VI-NEXT: s_add_i32 s22, s21, 0x7fff
+; VI-NEXT: s_or_b32 s23, s20, 0x400000
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s6, s16, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s6, v15
-; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s6, s16, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s6, v15
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: s_lshl_b32 s6, s19, 16
-; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; VI-NEXT: v_add_f32_e32 v3, s6, v15
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s6, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s6, v15
-; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: s_lshl_b32 s6, s18, 16
-; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16
-; VI-NEXT: v_add_f32_e32 v3, s6, v15
-; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s6, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; VI-NEXT: v_add_f32_e32 v5, s6, v15
-; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
-; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: s_lshl_b32 s6, s21, 16
-; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; VI-NEXT: v_add_f32_e32 v5, s6, v15
-; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
-; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: s_and_b32 s6, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; VI-NEXT: v_add_f32_e32 v6, s6, v15
-; VI-NEXT: v_bfe_u32 v7, v6, 16, 1
-; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6
-; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: s_lshl_b32 s6, s20, 16
-; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16
-; VI-NEXT: v_add_f32_e32 v5, s6, v15
-; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
-; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
-; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: s_and_b32 s6, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
-; VI-NEXT: v_add_f32_e32 v7, s6, v15
-; VI-NEXT: v_bfe_u32 v8, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: s_lshl_b32 s6, s23, 16
-; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16
-; VI-NEXT: v_add_f32_e32 v7, s6, v15
-; VI-NEXT: v_bfe_u32 v8, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: s_and_b32 s6, s23, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
-; VI-NEXT: v_add_f32_e32 v8, s6, v15
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: s_lshl_b32 s6, s22, 16
-; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16
-; VI-NEXT: v_add_f32_e32 v7, s6, v15
-; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: s_and_b32 s6, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
-; VI-NEXT: v_add_f32_e32 v9, s6, v15
-; VI-NEXT: v_bfe_u32 v10, v9, 16, 1
-; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9
-; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
-; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: s_lshl_b32 s6, s25, 16
-; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16
-; VI-NEXT: v_add_f32_e32 v9, s6, v15
-; VI-NEXT: v_bfe_u32 v10, v9, 16, 1
-; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9
-; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
-; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; VI-NEXT: s_and_b32 s6, s25, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
-; VI-NEXT: v_add_f32_e32 v10, s6, v15
-; VI-NEXT: v_bfe_u32 v11, v10, 16, 1
-; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10
-; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
-; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: s_lshl_b32 s6, s24, 16
-; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16
-; VI-NEXT: v_add_f32_e32 v9, s6, v15
-; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
-; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
-; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
-; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; VI-NEXT: s_and_b32 s6, s24, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc
-; VI-NEXT: v_add_f32_e32 v11, s6, v15
-; VI-NEXT: v_bfe_u32 v12, v11, 16, 1
-; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11
-; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12
-; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: s_lshl_b32 s6, s27, 16
-; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16
-; VI-NEXT: v_add_f32_e32 v11, s6, v15
-; VI-NEXT: v_bfe_u32 v12, v11, 16, 1
-; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11
-; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12
-; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; VI-NEXT: s_and_b32 s6, s27, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
-; VI-NEXT: v_add_f32_e32 v12, s6, v15
-; VI-NEXT: v_bfe_u32 v13, v12, 16, 1
-; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12
-; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
-; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: s_lshl_b32 s6, s26, 16
-; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16
-; VI-NEXT: v_add_f32_e32 v11, s6, v15
-; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
-; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
-; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
-; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; VI-NEXT: s_and_b32 s6, s26, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc
-; VI-NEXT: v_add_f32_e32 v13, s6, v15
-; VI-NEXT: v_bfe_u32 v14, v13, 16, 1
-; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13
-; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: s_lshl_b32 s6, s29, 16
-; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16
-; VI-NEXT: v_add_f32_e32 v13, s6, v15
-; VI-NEXT: v_bfe_u32 v14, v13, 16, 1
-; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13
-; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; VI-NEXT: s_and_b32 s6, s29, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc
-; VI-NEXT: v_add_f32_e32 v14, s6, v15
-; VI-NEXT: v_bfe_u32 v16, v14, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v14
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v14
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; VI-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: s_lshl_b32 s6, s28, 16
-; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16
-; VI-NEXT: v_add_f32_e32 v13, s6, v15
-; VI-NEXT: v_bfe_u32 v16, v13, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v13
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; VI-NEXT: s_and_b32 s6, s28, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc
-; VI-NEXT: v_add_f32_e32 v16, s6, v15
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: s_lshl_b32 s6, s5, 16
-; VI-NEXT: v_alignbit_b32 v13, v16, v13, 16
-; VI-NEXT: v_add_f32_e32 v16, s6, v15
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: s_and_b64 s[20:21], vcc, exec
+; VI-NEXT: s_cselect_b32 s20, s23, s22
+; VI-NEXT: s_and_b32 s19, s19, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s19, v1
+; VI-NEXT: v_readfirstlane_b32 s19, v2
+; VI-NEXT: s_bfe_u32 s21, s19, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s19
+; VI-NEXT: s_addk_i32 s21, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s19, 22
+; VI-NEXT: s_and_b64 s[22:23], vcc, exec
+; VI-NEXT: s_cselect_b32 s19, s19, s21
+; VI-NEXT: s_lshr_b32 s21, s19, 16
+; VI-NEXT: s_lshl_b32 s19, s18, 16
+; VI-NEXT: v_add_f32_e32 v2, s19, v1
+; VI-NEXT: s_lshr_b64 s[20:21], s[20:21], 16
+; VI-NEXT: v_readfirstlane_b32 s19, v2
+; VI-NEXT: s_bfe_u32 s21, s19, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s19
+; VI-NEXT: s_addk_i32 s21, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s19, 22
+; VI-NEXT: s_and_b64 s[22:23], vcc, exec
+; VI-NEXT: s_cselect_b32 s22, s19, s21
+; VI-NEXT: s_and_b32 s18, s18, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s18, v1
+; VI-NEXT: v_readfirstlane_b32 s18, v2
+; VI-NEXT: s_bfe_u32 s19, s18, 0x10010
+; VI-NEXT: s_add_i32 s19, s19, s18
+; VI-NEXT: s_add_i32 s21, s19, 0x7fff
+; VI-NEXT: s_or_b32 s23, s18, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[18:19], vcc, exec
+; VI-NEXT: s_cselect_b32 s18, s23, s21
+; VI-NEXT: s_lshl_b32 s21, s17, 16
+; VI-NEXT: v_add_f32_e32 v2, s21, v1
+; VI-NEXT: s_lshr_b32 s23, s18, 16
+; VI-NEXT: v_readfirstlane_b32 s21, v2
+; VI-NEXT: s_lshr_b64 s[18:19], s[22:23], 16
+; VI-NEXT: s_bfe_u32 s22, s21, 0x10010
+; VI-NEXT: s_add_i32 s22, s22, s21
+; VI-NEXT: s_add_i32 s24, s22, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s21, 22
+; VI-NEXT: s_and_b64 s[22:23], vcc, exec
+; VI-NEXT: s_cselect_b32 s22, s21, s24
+; VI-NEXT: s_and_b32 s17, s17, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s17, v1
+; VI-NEXT: v_readfirstlane_b32 s17, v2
+; VI-NEXT: s_bfe_u32 s21, s17, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s17
+; VI-NEXT: s_addk_i32 s21, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s17, 22
+; VI-NEXT: s_and_b64 s[24:25], vcc, exec
+; VI-NEXT: s_cselect_b32 s17, s17, s21
+; VI-NEXT: s_lshr_b32 s23, s17, 16
+; VI-NEXT: s_lshl_b32 s17, s16, 16
+; VI-NEXT: v_add_f32_e32 v2, s17, v1
+; VI-NEXT: v_readfirstlane_b32 s17, v2
+; VI-NEXT: s_bfe_u32 s21, s17, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s17
+; VI-NEXT: s_lshr_b64 s[22:23], s[22:23], 16
+; VI-NEXT: s_addk_i32 s21, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s17, 22
+; VI-NEXT: s_and_b64 s[24:25], vcc, exec
+; VI-NEXT: s_cselect_b32 s24, s17, s21
+; VI-NEXT: s_and_b32 s16, s16, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s16, v1
+; VI-NEXT: v_readfirstlane_b32 s16, v2
+; VI-NEXT: s_bfe_u32 s17, s16, 0x10010
+; VI-NEXT: s_add_i32 s17, s17, s16
+; VI-NEXT: s_add_i32 s21, s17, 0x7fff
+; VI-NEXT: s_or_b32 s23, s16, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[16:17], vcc, exec
+; VI-NEXT: s_cselect_b32 s16, s23, s21
+; VI-NEXT: s_lshl_b32 s21, s15, 16
+; VI-NEXT: v_add_f32_e32 v2, s21, v1
+; VI-NEXT: v_readfirstlane_b32 s21, v2
+; VI-NEXT: s_bfe_u32 s23, s21, 0x10010
+; VI-NEXT: s_lshr_b32 s25, s16, 16
+; VI-NEXT: s_add_i32 s23, s23, s21
+; VI-NEXT: s_lshr_b64 s[16:17], s[24:25], 16
+; VI-NEXT: s_addk_i32 s23, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s21, 22
+; VI-NEXT: s_and_b64 s[24:25], vcc, exec
+; VI-NEXT: s_cselect_b32 s24, s21, s23
+; VI-NEXT: s_and_b32 s15, s15, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s15, v1
+; VI-NEXT: v_readfirstlane_b32 s15, v2
+; VI-NEXT: s_bfe_u32 s21, s15, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s15
+; VI-NEXT: s_addk_i32 s21, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s15, 22
+; VI-NEXT: s_and_b64 s[26:27], vcc, exec
+; VI-NEXT: s_cselect_b32 s15, s15, s21
+; VI-NEXT: s_lshr_b32 s25, s15, 16
+; VI-NEXT: s_lshl_b32 s15, s14, 16
+; VI-NEXT: v_add_f32_e32 v2, s15, v1
+; VI-NEXT: v_readfirstlane_b32 s15, v2
+; VI-NEXT: s_bfe_u32 s21, s15, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s15
+; VI-NEXT: s_lshr_b64 s[24:25], s[24:25], 16
+; VI-NEXT: s_addk_i32 s21, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s15, 22
+; VI-NEXT: s_and_b64 s[26:27], vcc, exec
+; VI-NEXT: s_cselect_b32 s26, s15, s21
+; VI-NEXT: s_and_b32 s14, s14, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s14, v1
+; VI-NEXT: v_readfirstlane_b32 s14, v2
+; VI-NEXT: s_bfe_u32 s15, s14, 0x10010
+; VI-NEXT: s_add_i32 s15, s15, s14
+; VI-NEXT: s_add_i32 s21, s15, 0x7fff
+; VI-NEXT: s_or_b32 s23, s14, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[14:15], vcc, exec
+; VI-NEXT: s_cselect_b32 s14, s23, s21
+; VI-NEXT: s_lshl_b32 s21, s13, 16
+; VI-NEXT: v_add_f32_e32 v2, s21, v1
+; VI-NEXT: v_readfirstlane_b32 s21, v2
+; VI-NEXT: s_bfe_u32 s23, s21, 0x10010
+; VI-NEXT: s_lshr_b32 s27, s14, 16
+; VI-NEXT: s_add_i32 s23, s23, s21
+; VI-NEXT: s_lshr_b64 s[14:15], s[26:27], 16
+; VI-NEXT: s_addk_i32 s23, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s21, 22
+; VI-NEXT: s_and_b64 s[26:27], vcc, exec
+; VI-NEXT: s_cselect_b32 s26, s21, s23
+; VI-NEXT: s_and_b32 s13, s13, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s13, v1
+; VI-NEXT: v_readfirstlane_b32 s13, v2
+; VI-NEXT: s_bfe_u32 s21, s13, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s13
+; VI-NEXT: s_addk_i32 s21, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s13, 22
+; VI-NEXT: s_and_b64 s[28:29], vcc, exec
+; VI-NEXT: s_cselect_b32 s13, s13, s21
+; VI-NEXT: s_lshr_b32 s27, s13, 16
+; VI-NEXT: s_lshl_b32 s13, s12, 16
+; VI-NEXT: v_add_f32_e32 v2, s13, v1
+; VI-NEXT: v_readfirstlane_b32 s13, v2
+; VI-NEXT: s_bfe_u32 s21, s13, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s13
+; VI-NEXT: s_lshr_b64 s[26:27], s[26:27], 16
+; VI-NEXT: s_addk_i32 s21, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s13, 22
+; VI-NEXT: s_and_b64 s[28:29], vcc, exec
+; VI-NEXT: s_cselect_b32 s28, s13, s21
+; VI-NEXT: s_and_b32 s12, s12, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s12, v1
+; VI-NEXT: v_readfirstlane_b32 s12, v2
+; VI-NEXT: s_bfe_u32 s13, s12, 0x10010
+; VI-NEXT: s_add_i32 s13, s13, s12
+; VI-NEXT: s_add_i32 s21, s13, 0x7fff
+; VI-NEXT: s_or_b32 s23, s12, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[12:13], vcc, exec
+; VI-NEXT: s_cselect_b32 s12, s23, s21
+; VI-NEXT: s_lshl_b32 s21, s11, 16
+; VI-NEXT: v_add_f32_e32 v2, s21, v1
+; VI-NEXT: v_readfirstlane_b32 s21, v2
+; VI-NEXT: s_bfe_u32 s23, s21, 0x10010
+; VI-NEXT: s_lshr_b32 s29, s12, 16
+; VI-NEXT: s_add_i32 s23, s23, s21
+; VI-NEXT: s_lshr_b64 s[12:13], s[28:29], 16
+; VI-NEXT: s_addk_i32 s23, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s21, 22
+; VI-NEXT: s_and_b64 s[28:29], vcc, exec
+; VI-NEXT: s_cselect_b32 s28, s21, s23
+; VI-NEXT: s_and_b32 s11, s11, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s11, v1
+; VI-NEXT: v_readfirstlane_b32 s11, v2
+; VI-NEXT: s_bfe_u32 s21, s11, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s11
+; VI-NEXT: s_addk_i32 s21, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s11, 22
+; VI-NEXT: s_and_b64 s[40:41], vcc, exec
+; VI-NEXT: s_cselect_b32 s11, s11, s21
+; VI-NEXT: s_lshr_b32 s29, s11, 16
+; VI-NEXT: s_lshl_b32 s11, s10, 16
+; VI-NEXT: v_add_f32_e32 v2, s11, v1
+; VI-NEXT: v_readfirstlane_b32 s11, v2
+; VI-NEXT: s_bfe_u32 s21, s11, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s11
+; VI-NEXT: s_lshr_b64 s[28:29], s[28:29], 16
+; VI-NEXT: s_addk_i32 s21, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s11, 22
+; VI-NEXT: s_and_b64 s[40:41], vcc, exec
+; VI-NEXT: s_cselect_b32 s40, s11, s21
+; VI-NEXT: s_and_b32 s10, s10, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s10, v1
+; VI-NEXT: v_readfirstlane_b32 s10, v2
+; VI-NEXT: s_bfe_u32 s11, s10, 0x10010
+; VI-NEXT: s_add_i32 s11, s11, s10
+; VI-NEXT: s_add_i32 s21, s11, 0x7fff
+; VI-NEXT: s_or_b32 s23, s10, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[10:11], vcc, exec
+; VI-NEXT: s_cselect_b32 s10, s23, s21
+; VI-NEXT: s_lshl_b32 s21, s9, 16
+; VI-NEXT: v_add_f32_e32 v2, s21, v1
+; VI-NEXT: v_readfirstlane_b32 s21, v2
+; VI-NEXT: s_bfe_u32 s23, s21, 0x10010
+; VI-NEXT: s_lshr_b32 s41, s10, 16
+; VI-NEXT: s_add_i32 s23, s23, s21
+; VI-NEXT: s_lshr_b64 s[10:11], s[40:41], 16
+; VI-NEXT: s_addk_i32 s23, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s21, 22
+; VI-NEXT: s_and_b64 s[40:41], vcc, exec
+; VI-NEXT: s_cselect_b32 s40, s21, s23
+; VI-NEXT: s_and_b32 s9, s9, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s9, v1
+; VI-NEXT: v_readfirstlane_b32 s9, v2
+; VI-NEXT: s_bfe_u32 s21, s9, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s9
+; VI-NEXT: s_addk_i32 s21, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s9, 22
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s9, s9, s21
+; VI-NEXT: s_lshr_b32 s41, s9, 16
+; VI-NEXT: s_lshl_b32 s9, s8, 16
+; VI-NEXT: v_add_f32_e32 v2, s9, v1
+; VI-NEXT: v_readfirstlane_b32 s9, v2
+; VI-NEXT: s_bfe_u32 s21, s9, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s9
+; VI-NEXT: s_lshr_b64 s[40:41], s[40:41], 16
+; VI-NEXT: s_addk_i32 s21, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s9, 22
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s42, s9, s21
+; VI-NEXT: s_and_b32 s8, s8, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s8, v1
+; VI-NEXT: v_readfirstlane_b32 s8, v2
+; VI-NEXT: s_bfe_u32 s9, s8, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s8
+; VI-NEXT: s_add_i32 s21, s9, 0x7fff
+; VI-NEXT: s_or_b32 s23, s8, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[8:9], vcc, exec
+; VI-NEXT: s_cselect_b32 s8, s23, s21
+; VI-NEXT: s_lshl_b32 s21, s7, 16
+; VI-NEXT: v_add_f32_e32 v2, s21, v1
+; VI-NEXT: v_readfirstlane_b32 s21, v2
+; VI-NEXT: s_bfe_u32 s23, s21, 0x10010
+; VI-NEXT: s_lshr_b32 s43, s8, 16
+; VI-NEXT: s_add_i32 s23, s23, s21
+; VI-NEXT: s_lshr_b64 s[8:9], s[42:43], 16
+; VI-NEXT: s_addk_i32 s23, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s21, 22
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s42, s21, s23
+; VI-NEXT: s_and_b32 s7, s7, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s21, s7, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s7
+; VI-NEXT: s_addk_i32 s21, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: s_and_b64 s[44:45], vcc, exec
+; VI-NEXT: s_cselect_b32 s7, s7, s21
+; VI-NEXT: s_lshr_b32 s43, s7, 16
+; VI-NEXT: s_lshl_b32 s7, s6, 16
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s21, s7, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s7
+; VI-NEXT: s_lshr_b64 s[46:47], s[42:43], 16
+; VI-NEXT: s_addk_i32 s21, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s42, s7, s21
+; VI-NEXT: s_and_b32 s6, s6, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_add_i32 s21, s7, 0x7fff
+; VI-NEXT: s_or_b32 s23, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s23, s21
+; VI-NEXT: s_lshl_b32 s21, s5, 16
+; VI-NEXT: v_add_f32_e32 v2, s21, v1
+; VI-NEXT: v_readfirstlane_b32 s21, v2
+; VI-NEXT: s_bfe_u32 s23, s21, 0x10010
+; VI-NEXT: s_lshr_b32 s43, s6, 16
+; VI-NEXT: s_add_i32 s23, s23, s21
+; VI-NEXT: s_lshr_b64 s[6:7], s[42:43], 16
+; VI-NEXT: s_addk_i32 s23, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s21, 22
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s42, s21, s23
; VI-NEXT: s_and_b32 s5, s5, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_add_f32_e32 v17, s5, v15
-; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_add_f32_e32 v2, s5, v1
+; VI-NEXT: v_readfirstlane_b32 s5, v2
+; VI-NEXT: s_bfe_u32 s21, s5, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s5
+; VI-NEXT: s_addk_i32 s21, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s5, 22
+; VI-NEXT: s_and_b64 s[44:45], vcc, exec
+; VI-NEXT: s_cselect_b32 s5, s5, s21
+; VI-NEXT: s_lshr_b32 s43, s5, 16
; VI-NEXT: s_lshl_b32 s5, s4, 16
-; VI-NEXT: v_alignbit_b32 v16, v17, v16, 16
-; VI-NEXT: v_add_f32_e32 v17, s5, v15
-; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
+; VI-NEXT: v_add_f32_e32 v2, s5, v1
+; VI-NEXT: v_readfirstlane_b32 s5, v2
+; VI-NEXT: s_bfe_u32 s21, s5, 0x10010
+; VI-NEXT: s_add_i32 s21, s21, s5
+; VI-NEXT: s_lshr_b64 s[56:57], s[42:43], 16
+; VI-NEXT: s_addk_i32 s21, 0x7fff
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_bitset1_b32 s5, 22
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s42, s5, s21
; VI-NEXT: s_and_b32 s4, s4, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT: v_add_f32_e32 v15, s4, v15
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; VI-NEXT: v_bfe_u32 v18, v15, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; VI-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v17, 16
-; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
-; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
-; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14]
-; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
-; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
-; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16
-; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v15
-; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14
-; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v14
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v13
-; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12
-; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v12
-; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11
-; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v11
-; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10
-; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v10
-; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v10
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v9
-; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v9
-; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8
-; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v8
-; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7
-; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v7
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6
-; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v6
-; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v5
-; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v5
-; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4
-; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v4
-; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v3
-; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2
-; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v2
-; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1
-; VI-NEXT: s_branch .LBB109_5
-; VI-NEXT: .LBB109_3:
+; VI-NEXT: v_add_f32_e32 v1, s4, v1
+; VI-NEXT: v_readfirstlane_b32 s4, v1
+; VI-NEXT: s_bfe_u32 s5, s4, 0x10010
+; VI-NEXT: s_add_i32 s5, s5, s4
+; VI-NEXT: s_add_i32 s21, s5, 0x7fff
+; VI-NEXT: s_or_b32 s23, s4, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_cselect_b32 s4, s23, s21
+; VI-NEXT: s_lshr_b32 s43, s4, 16
+; VI-NEXT: s_lshr_b64 s[4:5], s[42:43], 16
+; VI-NEXT: s_mov_b32 s19, s20
+; VI-NEXT: s_mov_b32 s17, s22
+; VI-NEXT: s_mov_b32 s15, s24
+; VI-NEXT: s_mov_b32 s13, s26
+; VI-NEXT: s_mov_b32 s11, s28
+; VI-NEXT: s_mov_b32 s9, s40
+; VI-NEXT: s_mov_b32 s7, s46
+; VI-NEXT: s_mov_b32 s5, s56
+; VI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24
+; VI-NEXT: s_lshr_b64 s[44:45], s[6:7], 24
+; VI-NEXT: s_lshr_b32 s21, s56, 24
+; VI-NEXT: s_lshr_b32 s23, s56, 16
+; VI-NEXT: s_lshr_b32 s25, s56, 8
+; VI-NEXT: s_lshr_b32 s27, s4, 16
+; VI-NEXT: s_lshr_b32 s29, s4, 8
+; VI-NEXT: s_lshr_b32 s41, s46, 24
+; VI-NEXT: s_lshr_b32 s47, s46, 16
+; VI-NEXT: s_lshr_b32 s57, s46, 8
+; VI-NEXT: s_lshr_b32 s88, s6, 16
+; VI-NEXT: s_lshr_b32 s89, s6, 8
+; VI-NEXT: s_lshr_b32 s90, s40, 24
+; VI-NEXT: s_lshr_b32 s91, s40, 16
+; VI-NEXT: s_lshr_b32 s30, s40, 8
+; VI-NEXT: s_lshr_b32 s31, s8, 16
+; VI-NEXT: s_lshr_b32 s34, s8, 8
+; VI-NEXT: s_lshr_b32 s35, s28, 24
+; VI-NEXT: s_lshr_b32 s36, s28, 16
+; VI-NEXT: s_lshr_b32 s37, s28, 8
+; VI-NEXT: s_lshr_b32 s38, s10, 16
+; VI-NEXT: s_lshr_b32 s39, s10, 8
+; VI-NEXT: s_lshr_b32 s48, s26, 24
+; VI-NEXT: s_lshr_b32 s49, s26, 16
+; VI-NEXT: s_lshr_b32 s50, s26, 8
+; VI-NEXT: s_lshr_b32 s51, s12, 16
+; VI-NEXT: s_lshr_b32 s52, s12, 8
+; VI-NEXT: s_lshr_b32 s53, s24, 24
+; VI-NEXT: s_lshr_b32 s54, s24, 16
+; VI-NEXT: s_lshr_b32 s55, s24, 8
+; VI-NEXT: s_lshr_b32 s64, s14, 16
+; VI-NEXT: s_lshr_b32 s65, s14, 8
+; VI-NEXT: s_lshr_b32 s66, s22, 24
+; VI-NEXT: s_lshr_b32 s67, s22, 16
+; VI-NEXT: s_lshr_b32 s68, s22, 8
+; VI-NEXT: s_lshr_b32 s69, s16, 16
+; VI-NEXT: s_lshr_b32 s70, s16, 8
+; VI-NEXT: s_lshr_b32 s71, s20, 24
+; VI-NEXT: s_lshr_b32 s80, s20, 16
+; VI-NEXT: s_lshr_b32 s81, s20, 8
+; VI-NEXT: s_lshr_b32 s82, s18, 16
+; VI-NEXT: s_lshr_b32 s83, s18, 8
+; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24
+; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24
+; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24
+; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24
+; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24
+; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24
+; VI-NEXT: .LBB109_3: ; %end
+; VI-NEXT: s_and_b32 s5, s18, 0xff
+; VI-NEXT: s_lshl_b32 s7, s83, 8
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: s_lshl_b32 s7, s76, 8
+; VI-NEXT: s_and_b32 s9, s82, 0xff
+; VI-NEXT: s_or_b32 s7, s9, s7
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_and_b32 s5, s20, 0xff
+; VI-NEXT: s_lshl_b32 s7, s81, 8
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: s_and_b32 s7, s80, 0xff
+; VI-NEXT: s_lshl_b32 s9, s71, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s16, 0xff
+; VI-NEXT: s_lshl_b32 s7, s70, 8
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: s_and_b32 s7, s69, 0xff
+; VI-NEXT: s_lshl_b32 s9, s74, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s22, 0xff
+; VI-NEXT: s_lshl_b32 s7, s68, 8
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: s_and_b32 s7, s67, 0xff
+; VI-NEXT: s_lshl_b32 s9, s66, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s14, 0xff
+; VI-NEXT: s_lshl_b32 s7, s65, 8
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: s_and_b32 s7, s64, 0xff
+; VI-NEXT: s_lshl_b32 s9, s72, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s24, 0xff
+; VI-NEXT: s_lshl_b32 s7, s55, 8
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: s_and_b32 s7, s54, 0xff
+; VI-NEXT: s_lshl_b32 s9, s53, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s12, 0xff
+; VI-NEXT: s_lshl_b32 s7, s52, 8
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: s_and_b32 s7, s51, 0xff
+; VI-NEXT: s_lshl_b32 s9, s62, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s26, 0xff
+; VI-NEXT: s_lshl_b32 s7, s50, 8
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: s_and_b32 s7, s49, 0xff
+; VI-NEXT: s_lshl_b32 s9, s48, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s10, 0xff
+; VI-NEXT: s_lshl_b32 s7, s39, 8
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: s_and_b32 s7, s38, 0xff
+; VI-NEXT: s_lshl_b32 s9, s60, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s28, 0xff
+; VI-NEXT: s_lshl_b32 s7, s37, 8
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: s_and_b32 s7, s36, 0xff
+; VI-NEXT: s_lshl_b32 s9, s35, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s8, 0xff
+; VI-NEXT: s_lshl_b32 s7, s34, 8
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: s_and_b32 s7, s31, 0xff
+; VI-NEXT: s_lshl_b32 s8, s58, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s40, 0xff
+; VI-NEXT: s_lshl_b32 s7, s30, 8
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: s_and_b32 s7, s91, 0xff
+; VI-NEXT: s_lshl_b32 s8, s90, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s6, 0xff
+; VI-NEXT: s_lshl_b32 s6, s89, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s6, s88, 0xff
+; VI-NEXT: s_lshl_b32 s7, s44, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s46, 0xff
+; VI-NEXT: s_lshl_b32 s6, s57, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s6, s47, 0xff
+; VI-NEXT: s_lshl_b32 s7, s41, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s4, s4, 0xff
+; VI-NEXT: s_lshl_b32 s5, s29, 8
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: s_and_b32 s5, s27, 0xff
+; VI-NEXT: s_lshl_b32 s6, s42, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: s_and_b32 s4, s56, 0xff
+; VI-NEXT: s_lshl_b32 s5, s25, 8
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: s_and_b32 s5, s23, 0xff
+; VI-NEXT: s_lshl_b32 s6, s21, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: v_readlane_b32 s83, v18, 27
+; VI-NEXT: v_readlane_b32 s82, v18, 26
+; VI-NEXT: v_readlane_b32 s81, v18, 25
+; VI-NEXT: v_readlane_b32 s80, v18, 24
+; VI-NEXT: v_readlane_b32 s71, v18, 23
+; VI-NEXT: v_readlane_b32 s70, v18, 22
+; VI-NEXT: v_readlane_b32 s69, v18, 21
+; VI-NEXT: v_readlane_b32 s68, v18, 20
+; VI-NEXT: v_readlane_b32 s67, v18, 19
+; VI-NEXT: v_readlane_b32 s66, v18, 18
+; VI-NEXT: v_readlane_b32 s65, v18, 17
+; VI-NEXT: v_readlane_b32 s64, v18, 16
+; VI-NEXT: v_readlane_b32 s55, v18, 15
+; VI-NEXT: v_readlane_b32 s54, v18, 14
+; VI-NEXT: v_readlane_b32 s53, v18, 13
+; VI-NEXT: v_readlane_b32 s52, v18, 12
+; VI-NEXT: v_readlane_b32 s51, v18, 11
+; VI-NEXT: v_readlane_b32 s50, v18, 10
+; VI-NEXT: v_readlane_b32 s49, v18, 9
+; VI-NEXT: v_readlane_b32 s48, v18, 8
+; VI-NEXT: v_readlane_b32 s39, v18, 7
+; VI-NEXT: v_readlane_b32 s38, v18, 6
+; VI-NEXT: v_readlane_b32 s37, v18, 5
+; VI-NEXT: v_readlane_b32 s36, v18, 4
+; VI-NEXT: v_readlane_b32 s35, v18, 3
+; VI-NEXT: v_readlane_b32 s34, v18, 2
+; VI-NEXT: v_readlane_b32 s31, v18, 1
+; VI-NEXT: v_readlane_b32 s30, v18, 0
+; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+; VI-NEXT: .LBB109_4:
+; VI-NEXT: ; implicit-def: $sgpr83
+; VI-NEXT: ; implicit-def: $sgpr82
+; VI-NEXT: ; implicit-def: $sgpr76
+; VI-NEXT: ; implicit-def: $sgpr20
+; VI-NEXT: ; implicit-def: $sgpr81
+; VI-NEXT: ; implicit-def: $sgpr80
+; VI-NEXT: ; implicit-def: $sgpr71
+; VI-NEXT: ; implicit-def: $sgpr70
+; VI-NEXT: ; implicit-def: $sgpr69
+; VI-NEXT: ; implicit-def: $sgpr74
+; VI-NEXT: ; implicit-def: $sgpr22
+; VI-NEXT: ; implicit-def: $sgpr68
; VI-NEXT: ; implicit-def: $sgpr67
-; VI-NEXT: ; implicit-def: $sgpr65
-; VI-NEXT: ; implicit-def: $sgpr6
; VI-NEXT: ; implicit-def: $sgpr66
+; VI-NEXT: ; implicit-def: $sgpr65
; VI-NEXT: ; implicit-def: $sgpr64
+; VI-NEXT: ; implicit-def: $sgpr72
+; VI-NEXT: ; implicit-def: $sgpr24
; VI-NEXT: ; implicit-def: $sgpr55
; VI-NEXT: ; implicit-def: $sgpr54
-; VI-NEXT: ; implicit-def: $sgpr52
-; VI-NEXT: ; implicit-def: $sgpr8
; VI-NEXT: ; implicit-def: $sgpr53
+; VI-NEXT: ; implicit-def: $sgpr52
; VI-NEXT: ; implicit-def: $sgpr51
+; VI-NEXT: ; implicit-def: $sgpr62
+; VI-NEXT: ; implicit-def: $sgpr26
; VI-NEXT: ; implicit-def: $sgpr50
; VI-NEXT: ; implicit-def: $sgpr49
-; VI-NEXT: ; implicit-def: $sgpr39
-; VI-NEXT: ; implicit-def: $sgpr10
; VI-NEXT: ; implicit-def: $sgpr48
+; VI-NEXT: ; implicit-def: $sgpr39
; VI-NEXT: ; implicit-def: $sgpr38
+; VI-NEXT: ; implicit-def: $sgpr60
+; VI-NEXT: ; implicit-def: $sgpr28
; VI-NEXT: ; implicit-def: $sgpr37
; VI-NEXT: ; implicit-def: $sgpr36
-; VI-NEXT: ; implicit-def: $sgpr34
-; VI-NEXT: ; implicit-def: $sgpr12
; VI-NEXT: ; implicit-def: $sgpr35
+; VI-NEXT: ; implicit-def: $sgpr34
; VI-NEXT: ; implicit-def: $sgpr31
+; VI-NEXT: ; implicit-def: $sgpr58
+; VI-NEXT: ; implicit-def: $sgpr40
; VI-NEXT: ; implicit-def: $sgpr30
; VI-NEXT: ; implicit-def: $sgpr91
-; VI-NEXT: ; implicit-def: $sgpr89
-; VI-NEXT: ; implicit-def: $sgpr14
; VI-NEXT: ; implicit-def: $sgpr90
+; VI-NEXT: ; implicit-def: $sgpr89
; VI-NEXT: ; implicit-def: $sgpr88
-; VI-NEXT: ; implicit-def: $sgpr79
-; VI-NEXT: ; implicit-def: $sgpr78
-; VI-NEXT: ; implicit-def: $sgpr76
-; VI-NEXT: ; implicit-def: $sgpr40
-; VI-NEXT: ; implicit-def: $sgpr77
-; VI-NEXT: ; implicit-def: $sgpr75
-; VI-NEXT: ; implicit-def: $sgpr74
-; VI-NEXT: ; implicit-def: $sgpr73
-; VI-NEXT: ; implicit-def: $sgpr63
-; VI-NEXT: ; implicit-def: $sgpr42
-; VI-NEXT: ; implicit-def: $sgpr72
-; VI-NEXT: ; implicit-def: $sgpr62
-; VI-NEXT: ; implicit-def: $sgpr61
-; VI-NEXT: ; implicit-def: $sgpr60
-; VI-NEXT: ; implicit-def: $sgpr58
; VI-NEXT: ; implicit-def: $sgpr44
-; VI-NEXT: ; implicit-def: $sgpr59
+; VI-NEXT: ; implicit-def: $sgpr46
; VI-NEXT: ; implicit-def: $sgpr57
+; VI-NEXT: ; implicit-def: $sgpr47
+; VI-NEXT: ; implicit-def: $sgpr41
+; VI-NEXT: ; implicit-def: $sgpr29
+; VI-NEXT: ; implicit-def: $sgpr27
+; VI-NEXT: ; implicit-def: $sgpr42
; VI-NEXT: ; implicit-def: $sgpr56
+; VI-NEXT: ; implicit-def: $sgpr25
+; VI-NEXT: ; implicit-def: $sgpr23
+; VI-NEXT: ; implicit-def: $sgpr21
; VI-NEXT: s_branch .LBB109_2
-; VI-NEXT: .LBB109_4:
-; VI-NEXT: v_mov_b32_e32 v19, s44
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v19, s42
-; VI-NEXT: v_mov_b32_e32 v1, s16
-; VI-NEXT: v_mov_b32_e32 v2, s17
-; VI-NEXT: v_mov_b32_e32 v3, s18
-; VI-NEXT: v_mov_b32_e32 v4, s19
-; VI-NEXT: v_mov_b32_e32 v5, s20
-; VI-NEXT: v_mov_b32_e32 v6, s21
-; VI-NEXT: v_mov_b32_e32 v7, s22
-; VI-NEXT: v_mov_b32_e32 v8, s23
-; VI-NEXT: v_mov_b32_e32 v9, s24
-; VI-NEXT: v_mov_b32_e32 v10, s25
-; VI-NEXT: v_mov_b32_e32 v11, s26
-; VI-NEXT: v_mov_b32_e32 v12, s27
-; VI-NEXT: v_mov_b32_e32 v13, s28
-; VI-NEXT: v_mov_b32_e32 v14, s29
-; VI-NEXT: v_mov_b32_e32 v15, s4
-; VI-NEXT: v_mov_b32_e32 v16, s5
-; VI-NEXT: v_mov_b32_e32 v18, s67
-; VI-NEXT: v_mov_b32_e32 v62, s65
-; VI-NEXT: v_mov_b32_e32 v17, s66
-; VI-NEXT: v_mov_b32_e32 v60, s64
-; VI-NEXT: v_mov_b32_e32 v61, s55
-; VI-NEXT: v_mov_b32_e32 v58, s54
-; VI-NEXT: v_mov_b32_e32 v59, s52
-; VI-NEXT: v_mov_b32_e32 v57, s53
-; VI-NEXT: v_mov_b32_e32 v47, s51
-; VI-NEXT: v_mov_b32_e32 v56, s50
-; VI-NEXT: v_mov_b32_e32 v46, s49
-; VI-NEXT: v_mov_b32_e32 v45, s39
-; VI-NEXT: v_mov_b32_e32 v44, s48
-; VI-NEXT: v_mov_b32_e32 v42, s38
-; VI-NEXT: v_mov_b32_e32 v43, s37
-; VI-NEXT: v_mov_b32_e32 v41, s36
-; VI-NEXT: v_mov_b32_e32 v40, s34
-; VI-NEXT: v_mov_b32_e32 v55, s35
-; VI-NEXT: v_mov_b32_e32 v53, s31
-; VI-NEXT: v_mov_b32_e32 v54, s30
-; VI-NEXT: v_mov_b32_e32 v52, s91
-; VI-NEXT: v_mov_b32_e32 v51, s89
-; VI-NEXT: v_mov_b32_e32 v50, s90
-; VI-NEXT: v_mov_b32_e32 v48, s88
-; VI-NEXT: v_mov_b32_e32 v49, s79
-; VI-NEXT: v_mov_b32_e32 v39, s78
-; VI-NEXT: v_mov_b32_e32 v38, s76
-; VI-NEXT: v_mov_b32_e32 v37, s77
-; VI-NEXT: v_mov_b32_e32 v35, s75
-; VI-NEXT: v_mov_b32_e32 v36, s74
-; VI-NEXT: v_mov_b32_e32 v34, s73
-; VI-NEXT: v_mov_b32_e32 v33, s63
-; VI-NEXT: v_mov_b32_e32 v32, s72
-; VI-NEXT: v_mov_b32_e32 v30, s62
-; VI-NEXT: v_mov_b32_e32 v31, s61
-; VI-NEXT: v_mov_b32_e32 v29, s60
-; VI-NEXT: v_mov_b32_e32 v28, s58
-; VI-NEXT: v_mov_b32_e32 v27, s59
-; VI-NEXT: v_mov_b32_e32 v25, s57
-; VI-NEXT: v_mov_b32_e32 v26, s56
-; VI-NEXT: v_mov_b32_e32 v21, s12
-; VI-NEXT: v_mov_b32_e32 v22, s10
-; VI-NEXT: v_mov_b32_e32 v23, s8
-; VI-NEXT: v_mov_b32_e32 v24, s6
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v19, s40
-; VI-NEXT: v_mov_b32_e32 v20, s14
-; VI-NEXT: .LBB109_5: ; %end
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v24
-; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v62, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61
-; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v58
-; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56
-; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v22
-; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43
-; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v21
-; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v54
-; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v20
-; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49
-; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v19
-; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v36
-; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34
-; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_readlane_b32 s67, v63, 19
-; VI-NEXT: v_readlane_b32 s66, v63, 18
-; VI-NEXT: v_readlane_b32 s65, v63, 17
-; VI-NEXT: v_readlane_b32 s64, v63, 16
-; VI-NEXT: v_readlane_b32 s55, v63, 15
-; VI-NEXT: v_readlane_b32 s54, v63, 14
-; VI-NEXT: v_readlane_b32 s53, v63, 13
-; VI-NEXT: v_readlane_b32 s52, v63, 12
-; VI-NEXT: v_readlane_b32 s51, v63, 11
-; VI-NEXT: v_readlane_b32 s50, v63, 10
-; VI-NEXT: v_readlane_b32 s49, v63, 9
-; VI-NEXT: v_readlane_b32 s48, v63, 8
-; VI-NEXT: v_readlane_b32 s39, v63, 7
-; VI-NEXT: v_readlane_b32 s38, v63, 6
-; VI-NEXT: v_readlane_b32 s37, v63, 5
-; VI-NEXT: v_readlane_b32 s36, v63, 4
-; VI-NEXT: v_readlane_b32 s35, v63, 3
-; VI-NEXT: v_readlane_b32 s34, v63, 2
-; VI-NEXT: v_readlane_b32 s31, v63, 1
-; VI-NEXT: v_readlane_b32 s30, v63, 0
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31
-; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29
-; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26
-; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v32bf16_to_v64i8_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v4, s30, 0
-; GFX9-NEXT: v_writelane_b32 v4, s31, 1
-; GFX9-NEXT: v_writelane_b32 v4, s34, 2
-; GFX9-NEXT: v_writelane_b32 v4, s35, 3
-; GFX9-NEXT: v_writelane_b32 v4, s36, 4
-; GFX9-NEXT: v_writelane_b32 v4, s37, 5
-; GFX9-NEXT: v_writelane_b32 v4, s38, 6
-; GFX9-NEXT: v_writelane_b32 v4, s39, 7
-; GFX9-NEXT: v_writelane_b32 v4, s48, 8
-; GFX9-NEXT: v_writelane_b32 v4, s49, 9
-; GFX9-NEXT: v_writelane_b32 v4, s50, 10
-; GFX9-NEXT: v_writelane_b32 v4, s51, 11
-; GFX9-NEXT: v_writelane_b32 v4, s52, 12
-; GFX9-NEXT: v_writelane_b32 v4, s53, 13
+; GFX9-NEXT: v_writelane_b32 v18, s30, 0
+; GFX9-NEXT: v_writelane_b32 v18, s31, 1
+; GFX9-NEXT: v_writelane_b32 v18, s34, 2
+; GFX9-NEXT: v_writelane_b32 v18, s35, 3
+; GFX9-NEXT: v_writelane_b32 v18, s36, 4
+; GFX9-NEXT: v_writelane_b32 v18, s37, 5
+; GFX9-NEXT: v_writelane_b32 v18, s38, 6
+; GFX9-NEXT: v_writelane_b32 v18, s39, 7
+; GFX9-NEXT: v_writelane_b32 v18, s48, 8
+; GFX9-NEXT: v_writelane_b32 v18, s49, 9
+; GFX9-NEXT: v_writelane_b32 v18, s50, 10
+; GFX9-NEXT: v_writelane_b32 v18, s51, 11
+; GFX9-NEXT: v_writelane_b32 v18, s52, 12
+; GFX9-NEXT: v_writelane_b32 v18, s53, 13
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: v_mov_b32_e32 v5, s17
+; GFX9-NEXT: v_mov_b32_e32 v6, s18
+; GFX9-NEXT: v_mov_b32_e32 v7, s19
+; GFX9-NEXT: v_mov_b32_e32 v8, s20
+; GFX9-NEXT: v_mov_b32_e32 v9, s21
+; GFX9-NEXT: v_mov_b32_e32 v10, s22
+; GFX9-NEXT: v_mov_b32_e32 v11, s23
+; GFX9-NEXT: v_mov_b32_e32 v12, s24
+; GFX9-NEXT: v_mov_b32_e32 v13, s25
+; GFX9-NEXT: v_mov_b32_e32 v14, s26
+; GFX9-NEXT: v_mov_b32_e32 v15, s27
+; GFX9-NEXT: v_mov_b32_e32 v16, s28
+; GFX9-NEXT: v_mov_b32_e32 v17, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT: v_writelane_b32 v4, s54, 14
+; GFX9-NEXT: v_writelane_b32 v18, s54, 14
+; GFX9-NEXT: v_readfirstlane_b32 s18, v4
+; GFX9-NEXT: v_readfirstlane_b32 s19, v5
+; GFX9-NEXT: v_readfirstlane_b32 s16, v6
+; GFX9-NEXT: v_readfirstlane_b32 s17, v7
+; GFX9-NEXT: v_readfirstlane_b32 s14, v8
+; GFX9-NEXT: v_readfirstlane_b32 s15, v9
+; GFX9-NEXT: v_readfirstlane_b32 s12, v10
+; GFX9-NEXT: v_readfirstlane_b32 s13, v11
+; GFX9-NEXT: v_readfirstlane_b32 s10, v12
+; GFX9-NEXT: v_readfirstlane_b32 s11, v13
+; GFX9-NEXT: v_readfirstlane_b32 s8, v14
+; GFX9-NEXT: v_readfirstlane_b32 s9, v15
+; GFX9-NEXT: v_readfirstlane_b32 s6, v16
+; GFX9-NEXT: v_readfirstlane_b32 s7, v17
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
-; GFX9-NEXT: v_writelane_b32 v4, s55, 15
+; GFX9-NEXT: v_writelane_b32 v18, s55, 15
; GFX9-NEXT: s_cbranch_scc0 .LBB109_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s92, s5, 24
@@ -88868,425 +89877,425 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX9-NEXT: s_lshr_b32 s93, s5, 8
; GFX9-NEXT: s_lshr_b32 s94, s4, 16
; GFX9-NEXT: s_lshr_b32 s95, s4, 8
-; GFX9-NEXT: s_lshr_b32 s30, s29, 24
-; GFX9-NEXT: s_lshr_b32 s90, s29, 16
-; GFX9-NEXT: s_lshr_b32 s75, s29, 8
-; GFX9-NEXT: s_lshr_b32 s31, s28, 16
-; GFX9-NEXT: s_lshr_b32 s74, s28, 8
-; GFX9-NEXT: s_lshr_b32 s34, s27, 24
-; GFX9-NEXT: s_lshr_b32 s89, s27, 16
-; GFX9-NEXT: s_lshr_b32 s73, s27, 8
-; GFX9-NEXT: s_lshr_b32 s35, s26, 16
-; GFX9-NEXT: s_lshr_b32 s72, s26, 8
-; GFX9-NEXT: s_lshr_b32 s36, s25, 24
-; GFX9-NEXT: s_lshr_b32 s88, s25, 16
-; GFX9-NEXT: s_lshr_b32 s63, s25, 8
-; GFX9-NEXT: s_lshr_b32 s37, s24, 16
-; GFX9-NEXT: s_lshr_b32 s62, s24, 8
-; GFX9-NEXT: s_lshr_b32 s38, s23, 24
-; GFX9-NEXT: s_lshr_b32 s79, s23, 16
-; GFX9-NEXT: s_lshr_b32 s61, s23, 8
-; GFX9-NEXT: s_lshr_b32 s39, s22, 16
-; GFX9-NEXT: s_lshr_b32 s60, s22, 8
-; GFX9-NEXT: s_lshr_b32 s48, s21, 24
-; GFX9-NEXT: s_lshr_b32 s78, s21, 16
-; GFX9-NEXT: s_lshr_b32 s59, s21, 8
-; GFX9-NEXT: s_lshr_b32 s49, s20, 16
-; GFX9-NEXT: s_lshr_b32 s58, s20, 8
-; GFX9-NEXT: s_lshr_b32 s50, s19, 24
-; GFX9-NEXT: s_lshr_b32 s77, s19, 16
-; GFX9-NEXT: s_lshr_b32 s57, s19, 8
-; GFX9-NEXT: s_lshr_b32 s51, s18, 16
-; GFX9-NEXT: s_lshr_b32 s56, s18, 8
-; GFX9-NEXT: s_lshr_b32 s52, s17, 24
-; GFX9-NEXT: s_lshr_b32 s76, s17, 16
-; GFX9-NEXT: s_lshr_b32 s53, s17, 8
-; GFX9-NEXT: s_lshr_b32 s54, s16, 16
-; GFX9-NEXT: s_lshr_b32 s55, s16, 8
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24
-; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
-; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24
-; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24
-; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24
-; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24
+; GFX9-NEXT: s_lshr_b32 s30, s7, 24
+; GFX9-NEXT: s_lshr_b32 s90, s7, 16
+; GFX9-NEXT: s_lshr_b32 s75, s7, 8
+; GFX9-NEXT: s_lshr_b32 s31, s6, 16
+; GFX9-NEXT: s_lshr_b32 s74, s6, 8
+; GFX9-NEXT: s_lshr_b32 s34, s9, 24
+; GFX9-NEXT: s_lshr_b32 s89, s9, 16
+; GFX9-NEXT: s_lshr_b32 s73, s9, 8
+; GFX9-NEXT: s_lshr_b32 s35, s8, 16
+; GFX9-NEXT: s_lshr_b32 s72, s8, 8
+; GFX9-NEXT: s_lshr_b32 s36, s11, 24
+; GFX9-NEXT: s_lshr_b32 s88, s11, 16
+; GFX9-NEXT: s_lshr_b32 s63, s11, 8
+; GFX9-NEXT: s_lshr_b32 s37, s10, 16
+; GFX9-NEXT: s_lshr_b32 s62, s10, 8
+; GFX9-NEXT: s_lshr_b32 s38, s13, 24
+; GFX9-NEXT: s_lshr_b32 s79, s13, 16
+; GFX9-NEXT: s_lshr_b32 s61, s13, 8
+; GFX9-NEXT: s_lshr_b32 s39, s12, 16
+; GFX9-NEXT: s_lshr_b32 s60, s12, 8
+; GFX9-NEXT: s_lshr_b32 s48, s15, 24
+; GFX9-NEXT: s_lshr_b32 s78, s15, 16
+; GFX9-NEXT: s_lshr_b32 s59, s15, 8
+; GFX9-NEXT: s_lshr_b32 s49, s14, 16
+; GFX9-NEXT: s_lshr_b32 s58, s14, 8
+; GFX9-NEXT: s_lshr_b32 s50, s17, 24
+; GFX9-NEXT: s_lshr_b32 s77, s17, 16
+; GFX9-NEXT: s_lshr_b32 s57, s17, 8
+; GFX9-NEXT: s_lshr_b32 s51, s16, 16
+; GFX9-NEXT: s_lshr_b32 s56, s16, 8
+; GFX9-NEXT: s_lshr_b32 s52, s19, 24
+; GFX9-NEXT: s_lshr_b32 s76, s19, 16
+; GFX9-NEXT: s_lshr_b32 s53, s19, 8
+; GFX9-NEXT: s_lshr_b32 s54, s18, 16
+; GFX9-NEXT: s_lshr_b32 s55, s18, 8
+; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], 24
+; GFX9-NEXT: s_lshr_b64 s[22:23], s[6:7], 24
+; GFX9-NEXT: s_lshr_b64 s[24:25], s[8:9], 24
+; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24
+; GFX9-NEXT: s_lshr_b64 s[28:29], s[12:13], 24
+; GFX9-NEXT: s_lshr_b64 s[40:41], s[14:15], 24
+; GFX9-NEXT: s_lshr_b64 s[42:43], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[44:45], s[18:19], 24
; GFX9-NEXT: s_cbranch_execnz .LBB109_3
; GFX9-NEXT: .LBB109_2: ; %cmp.true
-; GFX9-NEXT: s_and_b32 s6, s17, 0xffff0000
+; GFX9-NEXT: s_and_b32 s20, s19, 0xffff0000
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: v_add_f32_e32 v2, s20, v1
+; GFX9-NEXT: v_readfirstlane_b32 s20, v2
+; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010
+; GFX9-NEXT: s_add_i32 s21, s21, s20
+; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff
+; GFX9-NEXT: s_or_b32 s23, s20, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s76, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s17, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s20, s23, s22
+; GFX9-NEXT: s_lshl_b32 s19, s19, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s19, v1
+; GFX9-NEXT: v_readfirstlane_b32 s19, v2
+; GFX9-NEXT: s_lshr_b32 s76, s20, 16
+; GFX9-NEXT: s_bfe_u32 s20, s19, 0x10010
+; GFX9-NEXT: s_add_i32 s20, s20, s19
+; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s17, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s16, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_bitset1_b32 s19, 22
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s19, s19, s22
+; GFX9-NEXT: s_and_b32 s20, s18, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s20, v1
+; GFX9-NEXT: v_readfirstlane_b32 s20, v2
+; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010
+; GFX9-NEXT: s_add_i32 s21, s21, s20
+; GFX9-NEXT: s_lshr_b32 s19, s19, 16
+; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff
+; GFX9-NEXT: s_or_b32 s23, s20, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s8, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s16, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s20, s23, s22
+; GFX9-NEXT: s_lshl_b32 s18, s18, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s18, v1
+; GFX9-NEXT: v_readfirstlane_b32 s18, v2
+; GFX9-NEXT: s_lshr_b32 s22, s20, 16
+; GFX9-NEXT: s_bfe_u32 s20, s18, 0x10010
+; GFX9-NEXT: s_add_i32 s20, s20, s18
+; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s10, s9
-; GFX9-NEXT: s_lshr_b32 s16, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s19, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s46, s16, s8
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_bitset1_b32 s18, 22
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s18, s18, s23
+; GFX9-NEXT: s_and_b32 s20, s17, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s20, v1
+; GFX9-NEXT: v_readfirstlane_b32 s20, v2
+; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010
+; GFX9-NEXT: s_lshr_b32 s18, s18, 16
+; GFX9-NEXT: s_add_i32 s21, s21, s20
+; GFX9-NEXT: s_pack_ll_b32_b16 s46, s18, s22
+; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff
+; GFX9-NEXT: s_or_b32 s23, s20, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s77, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s19, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s20, s23, s22
+; GFX9-NEXT: s_lshl_b32 s17, s17, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s17, v1
+; GFX9-NEXT: v_readfirstlane_b32 s17, v2
+; GFX9-NEXT: s_lshr_b32 s77, s20, 16
+; GFX9-NEXT: s_bfe_u32 s20, s17, 0x10010
+; GFX9-NEXT: s_add_i32 s20, s20, s17
+; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s19, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s18, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_bitset1_b32 s17, 22
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s17, s17, s22
+; GFX9-NEXT: s_and_b32 s20, s16, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s20, v1
+; GFX9-NEXT: v_readfirstlane_b32 s20, v2
+; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010
+; GFX9-NEXT: s_add_i32 s21, s21, s20
+; GFX9-NEXT: s_lshr_b32 s17, s17, 16
+; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff
+; GFX9-NEXT: s_or_b32 s23, s20, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s8, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s18, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s20, s23, s22
+; GFX9-NEXT: s_lshl_b32 s16, s16, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s16, v1
+; GFX9-NEXT: v_readfirstlane_b32 s16, v2
+; GFX9-NEXT: s_lshr_b32 s22, s20, 16
+; GFX9-NEXT: s_bfe_u32 s20, s16, 0x10010
+; GFX9-NEXT: s_add_i32 s20, s20, s16
+; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s10, s9
-; GFX9-NEXT: s_lshr_b32 s18, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s21, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s56, s18, s8
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_bitset1_b32 s16, 22
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s16, s16, s23
+; GFX9-NEXT: s_and_b32 s20, s15, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s20, v1
+; GFX9-NEXT: v_readfirstlane_b32 s20, v2
+; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010
+; GFX9-NEXT: s_lshr_b32 s16, s16, 16
+; GFX9-NEXT: s_add_i32 s21, s21, s20
+; GFX9-NEXT: s_pack_ll_b32_b16 s56, s16, s22
+; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff
+; GFX9-NEXT: s_or_b32 s23, s20, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s78, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s21, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s20, s23, s22
+; GFX9-NEXT: s_lshl_b32 s15, s15, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s15, v1
+; GFX9-NEXT: v_readfirstlane_b32 s15, v2
+; GFX9-NEXT: s_lshr_b32 s78, s20, 16
+; GFX9-NEXT: s_bfe_u32 s20, s15, 0x10010
+; GFX9-NEXT: s_add_i32 s20, s20, s15
+; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s21, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s20, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_bitset1_b32 s15, 22
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s15, s15, s22
+; GFX9-NEXT: s_and_b32 s20, s14, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s20, v1
+; GFX9-NEXT: v_readfirstlane_b32 s20, v2
+; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010
+; GFX9-NEXT: s_add_i32 s21, s21, s20
+; GFX9-NEXT: s_lshr_b32 s15, s15, 16
+; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff
+; GFX9-NEXT: s_or_b32 s23, s20, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s8, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s20, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s20, s23, s22
+; GFX9-NEXT: s_lshl_b32 s14, s14, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s14, v1
+; GFX9-NEXT: v_readfirstlane_b32 s14, v2
+; GFX9-NEXT: s_lshr_b32 s22, s20, 16
+; GFX9-NEXT: s_bfe_u32 s20, s14, 0x10010
+; GFX9-NEXT: s_add_i32 s20, s20, s14
+; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s10, s9
-; GFX9-NEXT: s_lshr_b32 s20, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s23, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s58, s20, s8
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_bitset1_b32 s14, 22
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s14, s14, s23
+; GFX9-NEXT: s_and_b32 s20, s13, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s20, v1
+; GFX9-NEXT: v_readfirstlane_b32 s20, v2
+; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010
+; GFX9-NEXT: s_lshr_b32 s14, s14, 16
+; GFX9-NEXT: s_add_i32 s21, s21, s20
+; GFX9-NEXT: s_pack_ll_b32_b16 s58, s14, s22
+; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff
+; GFX9-NEXT: s_or_b32 s23, s20, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s79, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s23, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s20, s23, s22
+; GFX9-NEXT: s_lshl_b32 s13, s13, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s13, v1
+; GFX9-NEXT: v_readfirstlane_b32 s13, v2
+; GFX9-NEXT: s_lshr_b32 s79, s20, 16
+; GFX9-NEXT: s_bfe_u32 s20, s13, 0x10010
+; GFX9-NEXT: s_add_i32 s20, s20, s13
+; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s23, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s22, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_bitset1_b32 s13, 22
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s13, s13, s22
+; GFX9-NEXT: s_and_b32 s20, s12, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s20, v1
+; GFX9-NEXT: v_readfirstlane_b32 s20, v2
+; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010
+; GFX9-NEXT: s_add_i32 s21, s21, s20
+; GFX9-NEXT: s_lshr_b32 s13, s13, 16
+; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff
+; GFX9-NEXT: s_or_b32 s23, s20, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s8, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s22, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s20, s23, s22
+; GFX9-NEXT: s_lshl_b32 s12, s12, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s12, v1
+; GFX9-NEXT: v_readfirstlane_b32 s12, v2
+; GFX9-NEXT: s_lshr_b32 s22, s20, 16
+; GFX9-NEXT: s_bfe_u32 s20, s12, 0x10010
+; GFX9-NEXT: s_add_i32 s20, s20, s12
+; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s10, s9
-; GFX9-NEXT: s_lshr_b32 s22, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s25, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s60, s22, s8
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_bitset1_b32 s12, 22
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s12, s12, s23
+; GFX9-NEXT: s_and_b32 s20, s11, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s20, v1
+; GFX9-NEXT: v_readfirstlane_b32 s20, v2
+; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010
+; GFX9-NEXT: s_lshr_b32 s12, s12, 16
+; GFX9-NEXT: s_add_i32 s21, s21, s20
+; GFX9-NEXT: s_pack_ll_b32_b16 s60, s12, s22
+; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff
+; GFX9-NEXT: s_or_b32 s23, s20, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s88, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s25, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s20, s23, s22
+; GFX9-NEXT: s_lshl_b32 s11, s11, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s11, v1
+; GFX9-NEXT: v_readfirstlane_b32 s11, v2
+; GFX9-NEXT: s_lshr_b32 s88, s20, 16
+; GFX9-NEXT: s_bfe_u32 s20, s11, 0x10010
+; GFX9-NEXT: s_add_i32 s20, s20, s11
+; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s25, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s24, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_bitset1_b32 s11, 22
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s11, s11, s22
+; GFX9-NEXT: s_and_b32 s20, s10, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s20, v1
+; GFX9-NEXT: v_readfirstlane_b32 s20, v2
+; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010
+; GFX9-NEXT: s_add_i32 s21, s21, s20
+; GFX9-NEXT: s_lshr_b32 s11, s11, 16
+; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff
+; GFX9-NEXT: s_or_b32 s23, s20, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s8, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s24, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s20, s23, s22
+; GFX9-NEXT: s_lshl_b32 s10, s10, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s10, v1
+; GFX9-NEXT: v_readfirstlane_b32 s10, v2
+; GFX9-NEXT: s_lshr_b32 s22, s20, 16
+; GFX9-NEXT: s_bfe_u32 s20, s10, 0x10010
+; GFX9-NEXT: s_add_i32 s20, s20, s10
+; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s10, s9
-; GFX9-NEXT: s_lshr_b32 s24, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s27, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s62, s24, s8
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_bitset1_b32 s10, 22
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s10, s10, s23
+; GFX9-NEXT: s_and_b32 s20, s9, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s20, v1
+; GFX9-NEXT: v_readfirstlane_b32 s20, v2
+; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010
+; GFX9-NEXT: s_lshr_b32 s10, s10, 16
+; GFX9-NEXT: s_add_i32 s21, s21, s20
+; GFX9-NEXT: s_pack_ll_b32_b16 s62, s10, s22
+; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff
+; GFX9-NEXT: s_or_b32 s23, s20, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s89, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s27, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s20, s23, s22
+; GFX9-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s9, v1
+; GFX9-NEXT: v_readfirstlane_b32 s9, v2
+; GFX9-NEXT: s_lshr_b32 s89, s20, 16
+; GFX9-NEXT: s_bfe_u32 s20, s9, 0x10010
+; GFX9-NEXT: s_add_i32 s20, s20, s9
+; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s27, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s26, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_bitset1_b32 s9, 22
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s9, s9, s22
+; GFX9-NEXT: s_and_b32 s20, s8, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s20, v1
+; GFX9-NEXT: v_readfirstlane_b32 s20, v2
+; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010
+; GFX9-NEXT: s_add_i32 s21, s21, s20
+; GFX9-NEXT: s_lshr_b32 s9, s9, 16
+; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff
+; GFX9-NEXT: s_or_b32 s23, s20, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s8, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s26, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s20, s23, s22
+; GFX9-NEXT: s_lshl_b32 s8, s8, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_lshr_b32 s22, s20, 16
+; GFX9-NEXT: s_bfe_u32 s20, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s20, s20, s8
+; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s10, s9
-; GFX9-NEXT: s_lshr_b32 s26, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s29, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s72, s26, s8
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_bitset1_b32 s8, 22
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s8, s23
+; GFX9-NEXT: s_and_b32 s20, s7, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s20, v1
+; GFX9-NEXT: v_readfirstlane_b32 s20, v2
+; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010
+; GFX9-NEXT: s_lshr_b32 s8, s8, 16
+; GFX9-NEXT: s_add_i32 s21, s21, s20
+; GFX9-NEXT: s_pack_ll_b32_b16 s72, s8, s22
+; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff
+; GFX9-NEXT: s_or_b32 s23, s20, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s90, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s29, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s20, s23, s22
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s7, v1
+; GFX9-NEXT: v_readfirstlane_b32 s7, v2
+; GFX9-NEXT: s_lshr_b32 s90, s20, 16
+; GFX9-NEXT: s_bfe_u32 s20, s7, 0x10010
+; GFX9-NEXT: s_add_i32 s20, s20, s7
+; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s29, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s28, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_bitset1_b32 s7, 22
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s7, s7, s22
+; GFX9-NEXT: s_and_b32 s20, s6, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s20, v1
+; GFX9-NEXT: v_readfirstlane_b32 s20, v2
+; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010
+; GFX9-NEXT: s_add_i32 s21, s21, s20
+; GFX9-NEXT: s_lshr_b32 s7, s7, 16
+; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff
+; GFX9-NEXT: s_or_b32 s23, s20, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s8, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s28, 16
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s20, s23, s22
+; GFX9-NEXT: s_lshl_b32 s6, s6, 16
; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
+; GFX9-NEXT: s_lshr_b32 s22, s20, 16
+; GFX9-NEXT: s_bfe_u32 s20, s6, 0x10010
+; GFX9-NEXT: s_add_i32 s20, s20, s6
+; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s10, s9
-; GFX9-NEXT: s_lshr_b32 s28, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s5, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s74, s28, s8
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_bitset1_b32 s6, 22
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s6, s6, s23
+; GFX9-NEXT: s_and_b32 s20, s5, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s20, v1
+; GFX9-NEXT: v_readfirstlane_b32 s20, v2
+; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010
+; GFX9-NEXT: s_lshr_b32 s6, s6, 16
+; GFX9-NEXT: s_add_i32 s21, s21, s20
+; GFX9-NEXT: s_pack_ll_b32_b16 s74, s6, s22
+; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff
+; GFX9-NEXT: s_or_b32 s23, s20, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s20, s23, s22
; GFX9-NEXT: s_lshl_b32 s5, s5, 16
; GFX9-NEXT: v_add_f32_e32 v2, s5, v1
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
-; GFX9-NEXT: s_lshr_b32 s91, s6, 16
-; GFX9-NEXT: s_bfe_u32 s6, s5, 0x10010
-; GFX9-NEXT: s_add_i32 s6, s6, s5
-; GFX9-NEXT: s_add_i32 s8, s6, 0x7fff
-; GFX9-NEXT: s_bitset1_b32 s5, 22
+; GFX9-NEXT: s_lshr_b32 s91, s20, 16
+; GFX9-NEXT: s_bfe_u32 s20, s5, 0x10010
+; GFX9-NEXT: s_add_i32 s20, s20, s5
+; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s5, s5, s8
-; GFX9-NEXT: s_and_b32 s6, s4, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
+; GFX9-NEXT: s_bitset1_b32 s5, 22
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s5, s5, s22
+; GFX9-NEXT: s_and_b32 s20, s4, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s20, v1
+; GFX9-NEXT: v_readfirstlane_b32 s20, v2
+; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010
+; GFX9-NEXT: s_add_i32 s21, s21, s20
; GFX9-NEXT: s_lshr_b32 s5, s5, 16
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff
+; GFX9-NEXT: s_or_b32 s23, s20, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s20, s23, s22
; GFX9-NEXT: s_lshl_b32 s4, s4, 16
; GFX9-NEXT: v_add_f32_e32 v1, s4, v1
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_lshr_b32 s8, s6, 16
-; GFX9-NEXT: s_bfe_u32 s6, s4, 0x10010
-; GFX9-NEXT: s_add_i32 s6, s6, s4
-; GFX9-NEXT: s_add_i32 s9, s6, 0x7fff
-; GFX9-NEXT: s_bitset1_b32 s4, 22
+; GFX9-NEXT: s_lshr_b32 s22, s20, 16
+; GFX9-NEXT: s_bfe_u32 s20, s4, 0x10010
+; GFX9-NEXT: s_add_i32 s20, s20, s4
+; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s4, s4, s9
+; GFX9-NEXT: s_bitset1_b32 s4, 22
+; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s4, s4, s23
; GFX9-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-NEXT: s_pack_ll_b32_b16 s47, s17, s76
-; GFX9-NEXT: s_pack_ll_b32_b16 s57, s19, s77
-; GFX9-NEXT: s_pack_ll_b32_b16 s59, s21, s78
-; GFX9-NEXT: s_pack_ll_b32_b16 s61, s23, s79
-; GFX9-NEXT: s_pack_ll_b32_b16 s63, s25, s88
-; GFX9-NEXT: s_pack_ll_b32_b16 s73, s27, s89
-; GFX9-NEXT: s_pack_ll_b32_b16 s75, s29, s90
+; GFX9-NEXT: s_pack_ll_b32_b16 s47, s19, s76
+; GFX9-NEXT: s_pack_ll_b32_b16 s57, s17, s77
+; GFX9-NEXT: s_pack_ll_b32_b16 s59, s15, s78
+; GFX9-NEXT: s_pack_ll_b32_b16 s61, s13, s79
+; GFX9-NEXT: s_pack_ll_b32_b16 s63, s11, s88
+; GFX9-NEXT: s_pack_ll_b32_b16 s73, s9, s89
+; GFX9-NEXT: s_pack_ll_b32_b16 s75, s7, s90
; GFX9-NEXT: s_pack_ll_b32_b16 s31, s5, s91
-; GFX9-NEXT: s_pack_ll_b32_b16 s30, s4, s8
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[30:31], 24
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[74:75], 24
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[72:73], 24
-; GFX9-NEXT: s_lshr_b64 s[12:13], s[62:63], 24
-; GFX9-NEXT: s_lshr_b64 s[14:15], s[60:61], 24
+; GFX9-NEXT: s_pack_ll_b32_b16 s30, s4, s22
+; GFX9-NEXT: s_lshr_b64 s[20:21], s[30:31], 24
+; GFX9-NEXT: s_lshr_b64 s[22:23], s[74:75], 24
+; GFX9-NEXT: s_lshr_b64 s[24:25], s[72:73], 24
+; GFX9-NEXT: s_lshr_b64 s[26:27], s[62:63], 24
+; GFX9-NEXT: s_lshr_b64 s[28:29], s[60:61], 24
; GFX9-NEXT: s_lshr_b64 s[40:41], s[58:59], 24
; GFX9-NEXT: s_lshr_b64 s[42:43], s[56:57], 24
; GFX9-NEXT: s_lshr_b64 s[44:45], s[46:47], 24
@@ -89323,165 +90332,165 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX9-NEXT: s_lshr_b32 s54, s46, 16
; GFX9-NEXT: s_lshr_b32 s55, s46, 8
; GFX9-NEXT: .LBB109_3: ; %end
-; GFX9-NEXT: s_and_b32 s7, s16, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s55, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s54, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s44, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s17, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s53, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s76, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s52, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_and_b32 s18, s18, 0xff
+; GFX9-NEXT: s_lshl_b32 s21, s55, 8
+; GFX9-NEXT: s_or_b32 s18, s18, s21
+; GFX9-NEXT: s_and_b32 s21, s54, 0xff
+; GFX9-NEXT: s_lshl_b32 s23, s44, 8
+; GFX9-NEXT: s_or_b32 s21, s21, s23
+; GFX9-NEXT: s_and_b32 s18, s18, 0xffff
+; GFX9-NEXT: s_lshl_b32 s21, s21, 16
+; GFX9-NEXT: s_or_b32 s18, s18, s21
+; GFX9-NEXT: v_mov_b32_e32 v1, s18
+; GFX9-NEXT: s_and_b32 s18, s19, 0xff
+; GFX9-NEXT: s_lshl_b32 s19, s53, 8
+; GFX9-NEXT: s_or_b32 s18, s18, s19
+; GFX9-NEXT: s_and_b32 s19, s76, 0xff
+; GFX9-NEXT: s_lshl_b32 s21, s52, 8
+; GFX9-NEXT: s_or_b32 s19, s19, s21
+; GFX9-NEXT: s_and_b32 s18, s18, 0xffff
+; GFX9-NEXT: s_lshl_b32 s19, s19, 16
+; GFX9-NEXT: s_or_b32 s18, s18, s19
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s18, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s56, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s51, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s42, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s18
+; GFX9-NEXT: s_and_b32 s16, s16, 0xff
+; GFX9-NEXT: s_lshl_b32 s18, s56, 8
+; GFX9-NEXT: s_or_b32 s16, s16, s18
+; GFX9-NEXT: s_and_b32 s18, s51, 0xff
+; GFX9-NEXT: s_lshl_b32 s19, s42, 8
+; GFX9-NEXT: s_or_b32 s18, s18, s19
+; GFX9-NEXT: s_and_b32 s16, s16, 0xffff
+; GFX9-NEXT: s_lshl_b32 s18, s18, 16
+; GFX9-NEXT: s_or_b32 s16, s16, s18
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s19, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s57, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s77, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s50, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-NEXT: s_and_b32 s16, s17, 0xff
+; GFX9-NEXT: s_lshl_b32 s17, s57, 8
+; GFX9-NEXT: s_or_b32 s16, s16, s17
+; GFX9-NEXT: s_and_b32 s17, s77, 0xff
+; GFX9-NEXT: s_lshl_b32 s18, s50, 8
+; GFX9-NEXT: s_or_b32 s17, s17, s18
+; GFX9-NEXT: s_and_b32 s16, s16, 0xffff
+; GFX9-NEXT: s_lshl_b32 s17, s17, 16
+; GFX9-NEXT: s_or_b32 s16, s16, s17
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s20, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s58, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s49, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s40, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-NEXT: s_and_b32 s14, s14, 0xff
+; GFX9-NEXT: s_lshl_b32 s16, s58, 8
+; GFX9-NEXT: s_or_b32 s14, s14, s16
+; GFX9-NEXT: s_and_b32 s16, s49, 0xff
+; GFX9-NEXT: s_lshl_b32 s17, s40, 8
+; GFX9-NEXT: s_or_b32 s16, s16, s17
+; GFX9-NEXT: s_and_b32 s14, s14, 0xffff
+; GFX9-NEXT: s_lshl_b32 s16, s16, 16
+; GFX9-NEXT: s_or_b32 s14, s14, s16
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s21, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s59, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s78, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s48, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-NEXT: s_and_b32 s14, s15, 0xff
+; GFX9-NEXT: s_lshl_b32 s15, s59, 8
+; GFX9-NEXT: s_or_b32 s14, s14, s15
+; GFX9-NEXT: s_and_b32 s15, s78, 0xff
+; GFX9-NEXT: s_lshl_b32 s16, s48, 8
+; GFX9-NEXT: s_or_b32 s15, s15, s16
+; GFX9-NEXT: s_and_b32 s14, s14, 0xffff
+; GFX9-NEXT: s_lshl_b32 s15, s15, 16
+; GFX9-NEXT: s_or_b32 s14, s14, s15
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s22, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s60, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s39, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s14, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-NEXT: s_and_b32 s12, s12, 0xff
+; GFX9-NEXT: s_lshl_b32 s14, s60, 8
+; GFX9-NEXT: s_or_b32 s12, s12, s14
+; GFX9-NEXT: s_and_b32 s14, s39, 0xff
+; GFX9-NEXT: s_lshl_b32 s15, s28, 8
+; GFX9-NEXT: s_or_b32 s14, s14, s15
+; GFX9-NEXT: s_and_b32 s12, s12, 0xffff
+; GFX9-NEXT: s_lshl_b32 s14, s14, 16
+; GFX9-NEXT: s_or_b32 s12, s12, s14
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s23, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s61, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s79, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s38, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-NEXT: s_and_b32 s12, s13, 0xff
+; GFX9-NEXT: s_lshl_b32 s13, s61, 8
+; GFX9-NEXT: s_or_b32 s12, s12, s13
+; GFX9-NEXT: s_and_b32 s13, s79, 0xff
+; GFX9-NEXT: s_lshl_b32 s14, s38, 8
+; GFX9-NEXT: s_or_b32 s13, s13, s14
+; GFX9-NEXT: s_and_b32 s12, s12, 0xffff
+; GFX9-NEXT: s_lshl_b32 s13, s13, 16
+; GFX9-NEXT: s_or_b32 s12, s12, s13
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s24, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s62, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s37, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s12, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-NEXT: s_and_b32 s10, s10, 0xff
+; GFX9-NEXT: s_lshl_b32 s12, s62, 8
+; GFX9-NEXT: s_or_b32 s10, s10, s12
+; GFX9-NEXT: s_and_b32 s12, s37, 0xff
+; GFX9-NEXT: s_lshl_b32 s13, s26, 8
+; GFX9-NEXT: s_or_b32 s12, s12, s13
+; GFX9-NEXT: s_and_b32 s10, s10, 0xffff
+; GFX9-NEXT: s_lshl_b32 s12, s12, 16
+; GFX9-NEXT: s_or_b32 s10, s10, s12
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s25, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s63, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s88, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s36, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-NEXT: s_and_b32 s10, s11, 0xff
+; GFX9-NEXT: s_lshl_b32 s11, s63, 8
+; GFX9-NEXT: s_or_b32 s10, s10, s11
+; GFX9-NEXT: s_and_b32 s11, s88, 0xff
+; GFX9-NEXT: s_lshl_b32 s12, s36, 8
+; GFX9-NEXT: s_or_b32 s11, s11, s12
+; GFX9-NEXT: s_and_b32 s10, s10, 0xffff
+; GFX9-NEXT: s_lshl_b32 s11, s11, 16
+; GFX9-NEXT: s_or_b32 s10, s10, s11
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s26, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s72, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s35, 0xff
-; GFX9-NEXT: s_lshl_b32 s10, s10, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s10
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-NEXT: s_and_b32 s8, s8, 0xff
+; GFX9-NEXT: s_lshl_b32 s10, s72, 8
+; GFX9-NEXT: s_or_b32 s8, s8, s10
+; GFX9-NEXT: s_and_b32 s10, s35, 0xff
+; GFX9-NEXT: s_lshl_b32 s11, s24, 8
+; GFX9-NEXT: s_or_b32 s10, s10, s11
+; GFX9-NEXT: s_and_b32 s8, s8, 0xffff
+; GFX9-NEXT: s_lshl_b32 s10, s10, 16
+; GFX9-NEXT: s_or_b32 s8, s8, s10
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s27, 0xff
+; GFX9-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-NEXT: s_and_b32 s8, s9, 0xff
; GFX9-NEXT: s_lshl_b32 s9, s73, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_or_b32 s8, s8, s9
; GFX9-NEXT: s_and_b32 s9, s89, 0xff
; GFX9-NEXT: s_lshl_b32 s10, s34, 8
; GFX9-NEXT: s_or_b32 s9, s9, s10
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT: s_and_b32 s8, s8, 0xffff
; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_or_b32 s8, s8, s9
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s28, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s74, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s31, 0xff
-; GFX9-NEXT: s_lshl_b32 s8, s8, 8
-; GFX9-NEXT: s_or_b32 s8, s9, s8
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s8, s8, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s8
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s29, 0xff
-; GFX9-NEXT: s_lshl_b32 s8, s75, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s8
-; GFX9-NEXT: s_and_b32 s8, s90, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s30, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-NEXT: s_and_b32 s6, s6, 0xff
+; GFX9-NEXT: s_lshl_b32 s8, s74, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s8
+; GFX9-NEXT: s_and_b32 s8, s31, 0xff
+; GFX9-NEXT: s_lshl_b32 s9, s22, 8
; GFX9-NEXT: s_or_b32 s8, s8, s9
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s8, s8, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s8
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s6, s7, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s75, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s90, 0xff
+; GFX9-NEXT: s_lshl_b32 s8, s30, 8
; GFX9-NEXT: s_or_b32 s7, s7, s8
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: s_and_b32 s4, s4, 0xff
-; GFX9-NEXT: s_lshl_b32 s7, s95, 8
-; GFX9-NEXT: s_or_b32 s4, s4, s7
-; GFX9-NEXT: s_and_b32 s7, s94, 0xff
-; GFX9-NEXT: s_lshl_b32 s6, s6, 8
-; GFX9-NEXT: s_or_b32 s6, s7, s6
+; GFX9-NEXT: s_lshl_b32 s6, s95, 8
+; GFX9-NEXT: s_or_b32 s4, s4, s6
+; GFX9-NEXT: s_and_b32 s6, s94, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s20, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
; GFX9-NEXT: s_lshl_b32 s6, s6, 16
; GFX9-NEXT: s_or_b32 s4, s4, s6
@@ -89499,24 +90508,24 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: v_readlane_b32 s55, v4, 15
-; GFX9-NEXT: v_readlane_b32 s54, v4, 14
-; GFX9-NEXT: v_readlane_b32 s53, v4, 13
-; GFX9-NEXT: v_readlane_b32 s52, v4, 12
-; GFX9-NEXT: v_readlane_b32 s51, v4, 11
-; GFX9-NEXT: v_readlane_b32 s50, v4, 10
-; GFX9-NEXT: v_readlane_b32 s49, v4, 9
-; GFX9-NEXT: v_readlane_b32 s48, v4, 8
-; GFX9-NEXT: v_readlane_b32 s39, v4, 7
-; GFX9-NEXT: v_readlane_b32 s38, v4, 6
-; GFX9-NEXT: v_readlane_b32 s37, v4, 5
-; GFX9-NEXT: v_readlane_b32 s36, v4, 4
-; GFX9-NEXT: v_readlane_b32 s35, v4, 3
-; GFX9-NEXT: v_readlane_b32 s34, v4, 2
-; GFX9-NEXT: v_readlane_b32 s31, v4, 1
-; GFX9-NEXT: v_readlane_b32 s30, v4, 0
+; GFX9-NEXT: v_readlane_b32 s55, v18, 15
+; GFX9-NEXT: v_readlane_b32 s54, v18, 14
+; GFX9-NEXT: v_readlane_b32 s53, v18, 13
+; GFX9-NEXT: v_readlane_b32 s52, v18, 12
+; GFX9-NEXT: v_readlane_b32 s51, v18, 11
+; GFX9-NEXT: v_readlane_b32 s50, v18, 10
+; GFX9-NEXT: v_readlane_b32 s49, v18, 9
+; GFX9-NEXT: v_readlane_b32 s48, v18, 8
+; GFX9-NEXT: v_readlane_b32 s39, v18, 7
+; GFX9-NEXT: v_readlane_b32 s38, v18, 6
+; GFX9-NEXT: v_readlane_b32 s37, v18, 5
+; GFX9-NEXT: v_readlane_b32 s36, v18, 4
+; GFX9-NEXT: v_readlane_b32 s35, v18, 3
+; GFX9-NEXT: v_readlane_b32 s34, v18, 2
+; GFX9-NEXT: v_readlane_b32 s31, v18, 1
+; GFX9-NEXT: v_readlane_b32 s30, v18, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -89541,31 +90550,31 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX9-NEXT: ; implicit-def: $sgpr48
; GFX9-NEXT: ; implicit-def: $sgpr60
; GFX9-NEXT: ; implicit-def: $sgpr39
-; GFX9-NEXT: ; implicit-def: $sgpr14
+; GFX9-NEXT: ; implicit-def: $sgpr28
; GFX9-NEXT: ; implicit-def: $sgpr61
; GFX9-NEXT: ; implicit-def: $sgpr79
; GFX9-NEXT: ; implicit-def: $sgpr38
; GFX9-NEXT: ; implicit-def: $sgpr62
; GFX9-NEXT: ; implicit-def: $sgpr37
-; GFX9-NEXT: ; implicit-def: $sgpr12
+; GFX9-NEXT: ; implicit-def: $sgpr26
; GFX9-NEXT: ; implicit-def: $sgpr63
; GFX9-NEXT: ; implicit-def: $sgpr88
; GFX9-NEXT: ; implicit-def: $sgpr36
; GFX9-NEXT: ; implicit-def: $sgpr72
; GFX9-NEXT: ; implicit-def: $sgpr35
-; GFX9-NEXT: ; implicit-def: $sgpr10
+; GFX9-NEXT: ; implicit-def: $sgpr24
; GFX9-NEXT: ; implicit-def: $sgpr73
; GFX9-NEXT: ; implicit-def: $sgpr89
; GFX9-NEXT: ; implicit-def: $sgpr34
; GFX9-NEXT: ; implicit-def: $sgpr74
; GFX9-NEXT: ; implicit-def: $sgpr31
-; GFX9-NEXT: ; implicit-def: $sgpr8
+; GFX9-NEXT: ; implicit-def: $sgpr22
; GFX9-NEXT: ; implicit-def: $sgpr75
; GFX9-NEXT: ; implicit-def: $sgpr90
; GFX9-NEXT: ; implicit-def: $sgpr30
; GFX9-NEXT: ; implicit-def: $sgpr95
; GFX9-NEXT: ; implicit-def: $sgpr94
-; GFX9-NEXT: ; implicit-def: $sgpr6
+; GFX9-NEXT: ; implicit-def: $sgpr20
; GFX9-NEXT: ; implicit-def: $sgpr93
; GFX9-NEXT: ; implicit-def: $sgpr91
; GFX9-NEXT: ; implicit-def: $sgpr92