diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 22769 |
1 files changed, 11889 insertions, 10880 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 39da45b..3e2b488 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -2144,112 +2144,110 @@ define inreg <32 x i16> @bitcast_v16i32_to_v32i16_scalar(<16 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s4, v0 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_mov_b32_e32 v34, s16 +; SI-NEXT: v_mov_b32_e32 v35, s17 +; SI-NEXT: v_mov_b32_e32 v36, s18 +; SI-NEXT: v_mov_b32_e32 v37, s19 +; SI-NEXT: v_mov_b32_e32 v38, s20 +; SI-NEXT: v_mov_b32_e32 v39, s21 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v50, s24 +; SI-NEXT: v_mov_b32_e32 v51, s25 +; SI-NEXT: v_mov_b32_e32 v52, s26 +; SI-NEXT: v_mov_b32_e32 v53, s27 +; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v55, s29 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s56, s5, 16 -; SI-NEXT: s_lshr_b32 s57, s29, 16 -; SI-NEXT: s_lshr_b32 s58, s27, 16 -; SI-NEXT: s_lshr_b32 s59, s25, 16 -; SI-NEXT: s_lshr_b32 s60, s23, 16 -; SI-NEXT: s_lshr_b32 s61, s21, 16 -; SI-NEXT: s_lshr_b32 s62, s19, 16 -; SI-NEXT: s_lshr_b32 s63, s17, 16 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s5, s5, 3 -; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 -; SI-NEXT: s_lshr_b32 s56, s5, 16 -; SI-NEXT: s_lshr_b32 s57, s29, 16 -; SI-NEXT: s_lshr_b32 s58, s27, 16 -; SI-NEXT: s_lshr_b32 s59, s25, 16 -; SI-NEXT: s_lshr_b32 s60, s23, 16 -; SI-NEXT: s_lshr_b32 s61, s21, 16 -; SI-NEXT: s_lshr_b32 s62, s19, 16 -; SI-NEXT: s_lshr_b32 s63, s17, 16 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s44 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s63 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s42 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s62 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s40 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s61 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s14 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s60 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s12 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s59 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v21, s10 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v23, s58 -; SI-NEXT: v_mov_b32_e32 v24, s28 -; SI-NEXT: v_mov_b32_e32 v25, s8 -; SI-NEXT: v_mov_b32_e32 v26, s29 -; SI-NEXT: v_mov_b32_e32 v27, s57 -; SI-NEXT: v_mov_b32_e32 v28, s4 -; SI-NEXT: v_mov_b32_e32 v29, s6 -; SI-NEXT: v_mov_b32_e32 v30, s5 -; SI-NEXT: v_mov_b32_e32 v31, s56 +; SI-NEXT: v_mov_b32_e32 v0, v34 +; SI-NEXT: v_mov_b32_e32 v2, v35 +; SI-NEXT: v_mov_b32_e32 v4, v36 +; SI-NEXT: v_mov_b32_e32 v6, v37 +; SI-NEXT: v_mov_b32_e32 v8, v38 +; SI-NEXT: v_mov_b32_e32 v10, v39 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v14, v49 +; SI-NEXT: v_mov_b32_e32 v16, v50 +; SI-NEXT: v_mov_b32_e32 v18, v51 +; SI-NEXT: v_mov_b32_e32 v20, v52 +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v24, v54 +; SI-NEXT: v_mov_b32_e32 v26, v55 +; SI-NEXT: v_mov_b32_e32 v28, v32 +; SI-NEXT: v_mov_b32_e32 v30, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v16i32_to_v32i16_scalar: @@ -2882,111 +2880,139 @@ define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v32i16_to_v16i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s25 +; VI-NEXT: v_mov_b32_e32 v13, s26 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v3 +; VI-NEXT: v_readfirstlane_b32 s7, v4 +; VI-NEXT: v_readfirstlane_b32 s8, v5 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s10, v7 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 +; VI-NEXT: v_readfirstlane_b32 s15, v12 +; VI-NEXT: v_readfirstlane_b32 s16, v13 +; VI-NEXT: v_readfirstlane_b32 s17, v14 +; VI-NEXT: v_readfirstlane_b32 s18, v15 +; VI-NEXT: v_readfirstlane_b32 s19, v16 +; VI-NEXT: v_readfirstlane_b32 s20, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v1 ; VI-NEXT: s_cbranch_scc0 .LBB15_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true -; VI-NEXT: s_add_i32 s5, s7, 3 -; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s6, 3 -; VI-NEXT: s_add_i32 s7, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s29, 3 -; VI-NEXT: s_add_i32 s6, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s28, 3 -; VI-NEXT: s_add_i32 s29, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s27, 3 -; VI-NEXT: s_add_i32 s28, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s26, 3 -; VI-NEXT: s_add_i32 s27, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s25, 3 -; VI-NEXT: s_add_i32 s26, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s15, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s24, 3 -; VI-NEXT: s_add_i32 s25, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s14, 3 +; VI-NEXT: s_add_i32 s15, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s14, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s23, 3 -; VI-NEXT: s_add_i32 s24, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s13, 3 +; VI-NEXT: s_add_i32 s14, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s13, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s22, 3 -; VI-NEXT: s_add_i32 s23, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s12, 3 +; VI-NEXT: s_add_i32 s13, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s12, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s21, 3 -; VI-NEXT: s_add_i32 s22, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s11, 3 +; VI-NEXT: s_add_i32 s12, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s11, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s20, 3 -; VI-NEXT: s_add_i32 s21, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s10, 3 +; VI-NEXT: s_add_i32 s11, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s10, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s19, 3 -; VI-NEXT: s_add_i32 s20, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s9, 3 +; VI-NEXT: s_add_i32 s10, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s9, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s18, 3 -; VI-NEXT: s_add_i32 s19, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s8, 3 +; VI-NEXT: s_add_i32 s9, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s8, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s17, 3 -; VI-NEXT: s_add_i32 s18, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_add_i32 s8, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s17, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: .LBB15_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s6 -; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s18 +; VI-NEXT: v_mov_b32_e32 v13, s19 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_4: ; VI-NEXT: s_branch .LBB15_2 @@ -3403,124 +3429,152 @@ define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i ; SI-LABEL: bitcast_v16i32_to_v32f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s21 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s21, v3 +; SI-NEXT: v_readfirstlane_b32 s20, v4 +; SI-NEXT: v_readfirstlane_b32 s19, v5 +; SI-NEXT: v_readfirstlane_b32 s18, v6 +; SI-NEXT: v_readfirstlane_b32 s17, v7 +; SI-NEXT: v_readfirstlane_b32 s16, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 ; SI-NEXT: v_readfirstlane_b32 s6, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s9, v1 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: s_lshr_b32 s4, s9, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s7, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s8, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s4, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s21 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s5, s20, 16 +; SI-NEXT: s_lshr_b32 s22, s19, 16 +; SI-NEXT: s_lshr_b32 s23, s18, 16 +; SI-NEXT: s_lshr_b32 s24, s17, 16 +; SI-NEXT: s_lshr_b32 s25, s16, 16 +; SI-NEXT: s_lshr_b32 s26, s15, 16 +; SI-NEXT: s_lshr_b32 s27, s14, 16 +; SI-NEXT: s_lshr_b32 s28, s13, 16 +; SI-NEXT: s_lshr_b32 s29, s12, 16 +; SI-NEXT: s_lshr_b32 s40, s11, 16 +; SI-NEXT: s_lshr_b32 s41, s10, 16 +; SI-NEXT: s_lshr_b32 s42, s8, 16 +; SI-NEXT: s_lshr_b32 s43, s7, 16 ; SI-NEXT: s_lshr_b32 s44, s6, 16 -; SI-NEXT: s_lshr_b32 s45, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: s_lshr_b32 s45, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s44 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: .LBB17_3: ; %end @@ -4788,7 +4842,35 @@ define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a ; SI-LABEL: bitcast_v16i32_to_v32bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s21 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s56, v3 +; SI-NEXT: v_readfirstlane_b32 s57, v4 +; SI-NEXT: v_readfirstlane_b32 s58, v5 +; SI-NEXT: v_readfirstlane_b32 s59, v6 +; SI-NEXT: v_readfirstlane_b32 s60, v7 +; SI-NEXT: v_readfirstlane_b32 s61, v8 +; SI-NEXT: v_readfirstlane_b32 s62, v9 +; SI-NEXT: v_readfirstlane_b32 s63, v10 +; SI-NEXT: v_readfirstlane_b32 s72, v11 +; SI-NEXT: v_readfirstlane_b32 s73, v12 +; SI-NEXT: v_readfirstlane_b32 s74, v13 +; SI-NEXT: v_readfirstlane_b32 s75, v14 +; SI-NEXT: v_readfirstlane_b32 s76, v15 +; SI-NEXT: v_readfirstlane_b32 s77, v16 ; SI-NEXT: v_readfirstlane_b32 s78, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s79, v1 @@ -4798,107 +4880,107 @@ define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a ; SI-NEXT: s_lshl_b32 s7, s79, 16 ; SI-NEXT: s_and_b32 s8, s78, 0xffff0000 ; SI-NEXT: s_lshl_b32 s9, s78, 16 -; SI-NEXT: s_and_b32 s10, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_and_b32 s12, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s28, 16 -; SI-NEXT: s_and_b32 s14, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s15, s27, 16 -; SI-NEXT: s_and_b32 s40, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s41, s26, 16 -; SI-NEXT: s_and_b32 s42, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s43, s25, 16 -; SI-NEXT: s_and_b32 s44, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s45, s24, 16 -; SI-NEXT: s_and_b32 s46, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s47, s23, 16 -; SI-NEXT: s_and_b32 s56, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s57, s22, 16 -; SI-NEXT: s_and_b32 s58, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s21, 16 -; SI-NEXT: s_and_b32 s60, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s61, s20, 16 -; SI-NEXT: s_and_b32 s62, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s63, s19, 16 -; SI-NEXT: s_and_b32 s72, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s73, s18, 16 -; SI-NEXT: s_and_b32 s74, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s17, 16 -; SI-NEXT: s_and_b32 s76, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s77, s16, 16 +; SI-NEXT: s_and_b32 s10, s77, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s77, 16 +; SI-NEXT: s_and_b32 s12, s76, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s76, 16 +; SI-NEXT: s_and_b32 s14, s75, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s75, 16 +; SI-NEXT: s_and_b32 s16, s74, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: s_and_b32 s18, s73, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s73, 16 +; SI-NEXT: s_and_b32 s20, s72, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s72, 16 +; SI-NEXT: s_and_b32 s22, s63, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s63, 16 +; SI-NEXT: s_and_b32 s24, s62, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s62, 16 +; SI-NEXT: s_and_b32 s26, s61, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s61, 16 +; SI-NEXT: s_and_b32 s28, s60, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s60, 16 +; SI-NEXT: s_and_b32 s40, s59, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s59, 16 +; SI-NEXT: s_and_b32 s42, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s58, 16 +; SI-NEXT: s_and_b32 s44, s57, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s57, 16 +; SI-NEXT: s_and_b32 s46, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s56, 16 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s56, s56, 3 +; SI-NEXT: s_add_i32 s57, s57, 3 +; SI-NEXT: s_add_i32 s58, s58, 3 +; SI-NEXT: s_add_i32 s59, s59, 3 +; SI-NEXT: s_add_i32 s60, s60, 3 +; SI-NEXT: s_add_i32 s61, s61, 3 +; SI-NEXT: s_add_i32 s62, s62, 3 +; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_add_i32 s72, s72, 3 +; SI-NEXT: s_add_i32 s73, s73, 3 +; SI-NEXT: s_add_i32 s74, s74, 3 +; SI-NEXT: s_add_i32 s75, s75, 3 +; SI-NEXT: s_add_i32 s76, s76, 3 +; SI-NEXT: s_add_i32 s77, s77, 3 ; SI-NEXT: s_add_i32 s78, s78, 3 ; SI-NEXT: s_add_i32 s79, s79, 3 ; SI-NEXT: s_and_b32 s6, s79, 0xffff0000 ; SI-NEXT: s_lshl_b32 s7, s79, 16 ; SI-NEXT: s_and_b32 s8, s78, 0xffff0000 ; SI-NEXT: s_lshl_b32 s9, s78, 16 -; SI-NEXT: s_and_b32 s10, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_and_b32 s12, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s28, 16 -; SI-NEXT: s_and_b32 s14, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s15, s27, 16 -; SI-NEXT: s_and_b32 s40, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s41, s26, 16 -; SI-NEXT: s_and_b32 s42, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s43, s25, 16 -; SI-NEXT: s_and_b32 s44, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s45, s24, 16 -; SI-NEXT: s_and_b32 s46, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s47, s23, 16 -; SI-NEXT: s_and_b32 s56, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s57, s22, 16 -; SI-NEXT: s_and_b32 s58, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s21, 16 -; SI-NEXT: s_and_b32 s60, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s61, s20, 16 -; SI-NEXT: s_and_b32 s62, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s63, s19, 16 -; SI-NEXT: s_and_b32 s72, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s73, s18, 16 -; SI-NEXT: s_and_b32 s74, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s17, 16 -; SI-NEXT: s_and_b32 s76, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s77, s16, 16 +; SI-NEXT: s_and_b32 s10, s77, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s77, 16 +; SI-NEXT: s_and_b32 s12, s76, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s76, 16 +; SI-NEXT: s_and_b32 s14, s75, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s75, 16 +; SI-NEXT: s_and_b32 s16, s74, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: s_and_b32 s18, s73, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s73, 16 +; SI-NEXT: s_and_b32 s20, s72, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s72, 16 +; SI-NEXT: s_and_b32 s22, s63, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s63, 16 +; SI-NEXT: s_and_b32 s24, s62, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s62, 16 +; SI-NEXT: s_and_b32 s26, s61, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s61, 16 +; SI-NEXT: s_and_b32 s28, s60, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s60, 16 +; SI-NEXT: s_and_b32 s40, s59, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s59, 16 +; SI-NEXT: s_and_b32 s42, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s58, 16 +; SI-NEXT: s_and_b32 s44, s57, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s57, 16 +; SI-NEXT: s_and_b32 s46, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s56, 16 ; SI-NEXT: .LBB21_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s77 -; SI-NEXT: v_mov_b32_e32 v1, s76 -; SI-NEXT: v_mov_b32_e32 v2, s75 -; SI-NEXT: v_mov_b32_e32 v3, s74 -; SI-NEXT: v_mov_b32_e32 v4, s73 -; SI-NEXT: v_mov_b32_e32 v5, s72 -; SI-NEXT: v_mov_b32_e32 v6, s63 -; SI-NEXT: v_mov_b32_e32 v7, s62 -; SI-NEXT: v_mov_b32_e32 v8, s61 -; SI-NEXT: v_mov_b32_e32 v9, s60 -; SI-NEXT: v_mov_b32_e32 v10, s59 -; SI-NEXT: v_mov_b32_e32 v11, s58 -; SI-NEXT: v_mov_b32_e32 v12, s57 -; SI-NEXT: v_mov_b32_e32 v13, s56 -; SI-NEXT: v_mov_b32_e32 v14, s47 -; SI-NEXT: v_mov_b32_e32 v15, s46 -; SI-NEXT: v_mov_b32_e32 v16, s45 -; SI-NEXT: v_mov_b32_e32 v17, s44 -; SI-NEXT: v_mov_b32_e32 v18, s43 -; SI-NEXT: v_mov_b32_e32 v19, s42 -; SI-NEXT: v_mov_b32_e32 v20, s41 -; SI-NEXT: v_mov_b32_e32 v21, s40 +; SI-NEXT: v_mov_b32_e32 v0, s47 +; SI-NEXT: v_mov_b32_e32 v1, s46 +; SI-NEXT: v_mov_b32_e32 v2, s45 +; SI-NEXT: v_mov_b32_e32 v3, s44 +; SI-NEXT: v_mov_b32_e32 v4, s43 +; SI-NEXT: v_mov_b32_e32 v5, s42 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s40 +; SI-NEXT: v_mov_b32_e32 v8, s29 +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s27 +; SI-NEXT: v_mov_b32_e32 v11, s26 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s22 +; SI-NEXT: v_mov_b32_e32 v16, s21 +; SI-NEXT: v_mov_b32_e32 v17, s20 +; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_mov_b32_e32 v21, s16 ; SI-NEXT: v_mov_b32_e32 v22, s15 ; SI-NEXT: v_mov_b32_e32 v23, s14 ; SI-NEXT: v_mov_b32_e32 v24, s13 @@ -4911,20 +4993,6 @@ define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a ; SI-NEXT: v_mov_b32_e32 v31, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr45 @@ -4933,6 +5001,20 @@ define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr13 @@ -6495,172 +6577,209 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshr_b64 v[0:1], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v41 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v59 +; SI-NEXT: v_lshr_b64 v[5:6], v[37:38], 16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; SI-NEXT: v_lshr_b64 v[6:7], v[35:36], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v57 +; SI-NEXT: v_lshr_b64 v[7:8], v[33:34], 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v56 +; SI-NEXT: v_lshr_b64 v[8:9], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v47 +; SI-NEXT: v_lshr_b64 v[9:10], v[29:30], 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v46 +; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v45 +; SI-NEXT: v_lshr_b64 v[11:12], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v44 +; SI-NEXT: v_lshr_b64 v[12:13], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshr_b64 v[13:14], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshr_b64 v[14:15], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v20, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16 +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v56 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v47 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v42 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 ; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -6669,687 +6788,665 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-LABEL: bitcast_v32bf16_to_v16i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 -; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: v_mov_b32_e32 v10, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s31, v1 -; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: s_cbranch_execnz .LBB23_3 ; VI-NEXT: .LBB23_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[17:18] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[18:19] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[19:20] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[20:21] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[21:22] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_lshrrev_b64 v[21:22], 16, v[21:22] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[22:23] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[23:24] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v24, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; VI-NEXT: s_branch .LBB23_5 -; VI-NEXT: .LBB23_3: -; VI-NEXT: s_branch .LBB23_2 -; VI-NEXT: .LBB23_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s30 -; VI-NEXT: v_mov_b32_e32 v15, s31 -; VI-NEXT: .LBB23_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[24:25] +; VI-NEXT: v_mov_b32_e32 v1, v23 +; VI-NEXT: v_mov_b32_e32 v3, v22 +; VI-NEXT: v_mov_b32_e32 v5, v21 +; VI-NEXT: v_mov_b32_e32 v7, v20 +; VI-NEXT: v_mov_b32_e32 v9, v19 +; VI-NEXT: v_mov_b32_e32 v11, v18 +; VI-NEXT: v_mov_b32_e32 v13, v17 +; VI-NEXT: v_mov_b32_e32 v15, v16 +; VI-NEXT: .LBB23_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 ; ; GFX9-LABEL: bitcast_v32bf16_to_v16i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 -; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s31, v1 -; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 ; GFX9-NEXT: .LBB23_2: ; %cmp.true -; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s4, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v16 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v16, v16, v15 +; GFX9-NEXT: v_add_u32_e32 v16, 0x7fff, v16 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc ; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_and_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v17, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v14 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v14, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v17, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v13 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v17, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v12 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v17, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v11 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v17, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v10 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v17, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v9 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v17, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v8 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v17, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v7 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v17, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v6 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v5 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v17, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v4 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v17, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v3 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v1 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc ; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 -; GFX9-NEXT: s_branch .LBB23_5 -; GFX9-NEXT: .LBB23_3: -; GFX9-NEXT: s_branch .LBB23_2 -; GFX9-NEXT: .LBB23_4: -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 -; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: v_mov_b32_e32 v14, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s31 -; GFX9-NEXT: .LBB23_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v17, 16, v0 +; GFX9-NEXT: .LBB23_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 ; ; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v16i32_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -8424,22 +8521,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -8475,6 +8556,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8757,22 +8854,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -8806,6 +8887,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9077,25 +9174,9 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -9122,6 +9203,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9704,345 +9801,373 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v4, s30, 0 -; SI-NEXT: v_writelane_b32 v4, s31, 1 -; SI-NEXT: v_writelane_b32 v4, s34, 2 -; SI-NEXT: v_writelane_b32 v4, s35, 3 -; SI-NEXT: v_writelane_b32 v4, s36, 4 -; SI-NEXT: v_writelane_b32 v4, s37, 5 -; SI-NEXT: v_writelane_b32 v4, s38, 6 -; SI-NEXT: v_writelane_b32 v4, s39, 7 -; SI-NEXT: v_writelane_b32 v4, s48, 8 -; SI-NEXT: v_writelane_b32 v4, s49, 9 -; SI-NEXT: v_writelane_b32 v4, s50, 10 -; SI-NEXT: v_writelane_b32 v4, s51, 11 -; SI-NEXT: v_writelane_b32 v4, s52, 12 -; SI-NEXT: v_writelane_b32 v4, s53, 13 -; SI-NEXT: v_writelane_b32 v4, s54, 14 -; SI-NEXT: v_writelane_b32 v4, s55, 15 -; SI-NEXT: v_writelane_b32 v4, s64, 16 -; SI-NEXT: v_writelane_b32 v4, s65, 17 -; SI-NEXT: v_writelane_b32 v4, s66, 18 -; SI-NEXT: v_writelane_b32 v4, s67, 19 -; SI-NEXT: v_writelane_b32 v4, s68, 20 -; SI-NEXT: v_writelane_b32 v4, s69, 21 -; SI-NEXT: v_writelane_b32 v4, s70, 22 -; SI-NEXT: v_writelane_b32 v4, s71, 23 -; SI-NEXT: v_writelane_b32 v4, s80, 24 -; SI-NEXT: v_writelane_b32 v4, s81, 25 -; SI-NEXT: v_writelane_b32 v4, s82, 26 -; SI-NEXT: v_writelane_b32 v4, s83, 27 +; SI-NEXT: v_writelane_b32 v18, s30, 0 +; SI-NEXT: v_writelane_b32 v18, s31, 1 +; SI-NEXT: v_writelane_b32 v18, s34, 2 +; SI-NEXT: v_writelane_b32 v18, s35, 3 +; SI-NEXT: v_writelane_b32 v18, s36, 4 +; SI-NEXT: v_writelane_b32 v18, s37, 5 +; SI-NEXT: v_writelane_b32 v18, s38, 6 +; SI-NEXT: v_writelane_b32 v18, s39, 7 +; SI-NEXT: v_writelane_b32 v18, s48, 8 +; SI-NEXT: v_writelane_b32 v18, s49, 9 +; SI-NEXT: v_writelane_b32 v18, s50, 10 +; SI-NEXT: v_writelane_b32 v18, s51, 11 +; SI-NEXT: v_writelane_b32 v18, s52, 12 +; SI-NEXT: v_writelane_b32 v18, s53, 13 +; SI-NEXT: v_writelane_b32 v18, s54, 14 +; SI-NEXT: v_writelane_b32 v18, s55, 15 +; SI-NEXT: v_writelane_b32 v18, s64, 16 +; SI-NEXT: v_writelane_b32 v18, s65, 17 +; SI-NEXT: v_writelane_b32 v18, s66, 18 +; SI-NEXT: v_writelane_b32 v18, s67, 19 +; SI-NEXT: v_writelane_b32 v18, s68, 20 +; SI-NEXT: v_writelane_b32 v18, s69, 21 +; SI-NEXT: v_writelane_b32 v18, s70, 22 +; SI-NEXT: v_writelane_b32 v18, s71, 23 +; SI-NEXT: v_writelane_b32 v18, s80, 24 +; SI-NEXT: v_writelane_b32 v18, s81, 25 +; SI-NEXT: v_writelane_b32 v18, s82, 26 +; SI-NEXT: v_writelane_b32 v18, s83, 27 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_writelane_b32 v4, s84, 28 +; SI-NEXT: v_writelane_b32 v18, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s20, v4 +; SI-NEXT: v_readfirstlane_b32 s21, v5 +; SI-NEXT: v_readfirstlane_b32 s16, v6 +; SI-NEXT: v_readfirstlane_b32 s17, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v17 ; SI-NEXT: v_readfirstlane_b32 s4, v1 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: s_and_b64 s[18:19], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v2 -; SI-NEXT: v_writelane_b32 v4, s85, 29 +; SI-NEXT: v_writelane_b32 v18, s85, 29 ; SI-NEXT: s_cbranch_scc0 .LBB25_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s38, s5, 24 ; SI-NEXT: s_lshr_b32 s39, s5, 16 ; SI-NEXT: s_lshr_b32 s48, s5, 8 -; SI-NEXT: s_lshr_b32 s49, s29, 24 -; SI-NEXT: s_lshr_b32 s50, s29, 16 -; SI-NEXT: s_lshr_b32 s51, s29, 8 -; SI-NEXT: s_lshr_b32 s52, s27, 24 -; SI-NEXT: s_lshr_b32 s53, s27, 16 -; SI-NEXT: s_lshr_b32 s54, s27, 8 -; SI-NEXT: s_lshr_b32 s55, s25, 24 -; SI-NEXT: s_lshr_b32 s64, s25, 16 -; SI-NEXT: s_lshr_b32 s65, s25, 8 -; SI-NEXT: s_lshr_b32 s66, s23, 24 -; SI-NEXT: s_lshr_b32 s67, s23, 16 -; SI-NEXT: s_lshr_b32 s68, s23, 8 -; SI-NEXT: s_lshr_b32 s69, s21, 24 -; SI-NEXT: s_lshr_b32 s70, s21, 16 -; SI-NEXT: s_lshr_b32 s71, s21, 8 -; SI-NEXT: s_lshr_b32 s80, s19, 24 -; SI-NEXT: s_lshr_b32 s81, s19, 16 -; SI-NEXT: s_lshr_b32 s82, s19, 8 -; SI-NEXT: s_lshr_b32 s83, s17, 24 -; SI-NEXT: s_lshr_b32 s84, s17, 16 -; SI-NEXT: s_lshr_b32 s85, s17, 8 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 -; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s49, s7, 24 +; SI-NEXT: s_lshr_b32 s50, s7, 16 +; SI-NEXT: s_lshr_b32 s51, s7, 8 +; SI-NEXT: s_lshr_b32 s52, s9, 24 +; SI-NEXT: s_lshr_b32 s53, s9, 16 +; SI-NEXT: s_lshr_b32 s54, s9, 8 +; SI-NEXT: s_lshr_b32 s55, s11, 24 +; SI-NEXT: s_lshr_b32 s64, s11, 16 +; SI-NEXT: s_lshr_b32 s65, s11, 8 +; SI-NEXT: s_lshr_b32 s66, s13, 24 +; SI-NEXT: s_lshr_b32 s67, s13, 16 +; SI-NEXT: s_lshr_b32 s68, s13, 8 +; SI-NEXT: s_lshr_b32 s69, s15, 24 +; SI-NEXT: s_lshr_b32 s70, s15, 16 +; SI-NEXT: s_lshr_b32 s71, s15, 8 +; SI-NEXT: s_lshr_b32 s80, s17, 24 +; SI-NEXT: s_lshr_b32 s81, s17, 16 +; SI-NEXT: s_lshr_b32 s82, s17, 8 +; SI-NEXT: s_lshr_b32 s83, s21, 24 +; SI-NEXT: s_lshr_b32 s84, s21, 16 +; SI-NEXT: s_lshr_b32 s85, s21, 8 +; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[20:21], 8 ; SI-NEXT: s_cbranch_execnz .LBB25_3 ; SI-NEXT: .LBB25_2: ; %cmp.true -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s5, s5, 3 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 -; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16 ; SI-NEXT: s_lshr_b32 s38, s5, 24 ; SI-NEXT: s_lshr_b32 s39, s5, 16 ; SI-NEXT: s_lshr_b32 s48, s5, 8 -; SI-NEXT: s_lshr_b32 s49, s29, 24 -; SI-NEXT: s_lshr_b32 s50, s29, 16 -; SI-NEXT: s_lshr_b32 s51, s29, 8 -; SI-NEXT: s_lshr_b32 s52, s27, 24 -; SI-NEXT: s_lshr_b32 s53, s27, 16 -; SI-NEXT: s_lshr_b32 s54, s27, 8 -; SI-NEXT: s_lshr_b32 s55, s25, 24 -; SI-NEXT: s_lshr_b32 s64, s25, 16 -; SI-NEXT: s_lshr_b32 s65, s25, 8 -; SI-NEXT: s_lshr_b32 s66, s23, 24 -; SI-NEXT: s_lshr_b32 s67, s23, 16 -; SI-NEXT: s_lshr_b32 s68, s23, 8 -; SI-NEXT: s_lshr_b32 s69, s21, 24 -; SI-NEXT: s_lshr_b32 s70, s21, 16 -; SI-NEXT: s_lshr_b32 s71, s21, 8 -; SI-NEXT: s_lshr_b32 s80, s19, 24 -; SI-NEXT: s_lshr_b32 s81, s19, 16 -; SI-NEXT: s_lshr_b32 s82, s19, 8 -; SI-NEXT: s_lshr_b32 s83, s17, 24 -; SI-NEXT: s_lshr_b32 s84, s17, 16 -; SI-NEXT: s_lshr_b32 s85, s17, 8 -; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s49, s7, 24 +; SI-NEXT: s_lshr_b32 s50, s7, 16 +; SI-NEXT: s_lshr_b32 s51, s7, 8 +; SI-NEXT: s_lshr_b32 s52, s9, 24 +; SI-NEXT: s_lshr_b32 s53, s9, 16 +; SI-NEXT: s_lshr_b32 s54, s9, 8 +; SI-NEXT: s_lshr_b32 s55, s11, 24 +; SI-NEXT: s_lshr_b32 s64, s11, 16 +; SI-NEXT: s_lshr_b32 s65, s11, 8 +; SI-NEXT: s_lshr_b32 s66, s13, 24 +; SI-NEXT: s_lshr_b32 s67, s13, 16 +; SI-NEXT: s_lshr_b32 s68, s13, 8 +; SI-NEXT: s_lshr_b32 s69, s15, 24 +; SI-NEXT: s_lshr_b32 s70, s15, 16 +; SI-NEXT: s_lshr_b32 s71, s15, 8 +; SI-NEXT: s_lshr_b32 s80, s17, 24 +; SI-NEXT: s_lshr_b32 s81, s17, 16 +; SI-NEXT: s_lshr_b32 s82, s17, 8 +; SI-NEXT: s_lshr_b32 s83, s21, 24 +; SI-NEXT: s_lshr_b32 s84, s21, 16 +; SI-NEXT: s_lshr_b32 s85, s21, 8 +; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[20:21], 8 ; SI-NEXT: .LBB25_3: ; %end -; SI-NEXT: s_lshl_b32 s7, s36, 8 -; SI-NEXT: s_and_b32 s9, s16, 0xff -; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: s_and_b32 s9, s34, 0xff -; SI-NEXT: s_lshl_b32 s11, s30, 24 -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_and_b32 s7, s17, 0xff -; SI-NEXT: s_lshl_b32 s9, s85, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s84, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s83, 24 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_lshl_b32 s7, s94, 8 -; SI-NEXT: s_and_b32 s9, s18, 0xff -; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: s_and_b32 s9, s92, 0xff -; SI-NEXT: s_lshl_b32 s11, s90, 24 -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: s_and_b32 s7, s19, 0xff -; SI-NEXT: s_lshl_b32 s9, s82, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s81, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s80, 24 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s19, s36, 8 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: s_and_b32 s20, s34, 0xff +; SI-NEXT: s_lshl_b32 s23, s30, 24 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_or_b32 s20, s23, s20 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: v_mov_b32_e32 v1, s19 +; SI-NEXT: s_and_b32 s19, s21, 0xff +; SI-NEXT: s_lshl_b32 s20, s85, 8 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s84, 0xff +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_lshl_b32 s21, s83, 24 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: v_mov_b32_e32 v2, s19 +; SI-NEXT: s_lshl_b32 s19, s94, 8 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_or_b32 s16, s16, s19 +; SI-NEXT: s_and_b32 s19, s92, 0xff +; SI-NEXT: s_lshl_b32 s20, s90, 24 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s19 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xff +; SI-NEXT: s_lshl_b32 s17, s82, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s81, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s19, s80, 24 +; SI-NEXT: s_or_b32 s17, s19, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s20, 0xff -; SI-NEXT: s_lshl_b32 s9, s78, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s76, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s74, 24 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s16, s78, 8 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: s_and_b32 s16, s76, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, s74, 24 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s21, 0xff -; SI-NEXT: s_lshl_b32 s9, s71, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s70, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s69, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xff +; SI-NEXT: s_lshl_b32 s15, s71, 8 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s70, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_lshl_b32 s16, s69, 24 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s15, s16, s15 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: s_lshl_b32 s9, s88, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s72, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s62, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s14, s88, 8 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_and_b32 s14, s72, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s62, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s23, 0xff -; SI-NEXT: s_lshl_b32 s9, s68, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s67, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s66, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xff +; SI-NEXT: s_lshl_b32 s13, s68, 8 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s67, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s14, s66, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s24, 0xff -; SI-NEXT: s_lshl_b32 s9, s60, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s58, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s56, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s12, s60, 8 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s12, s58, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s13, s56, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s25, 0xff -; SI-NEXT: s_lshl_b32 s9, s65, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s64, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s55, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xff +; SI-NEXT: s_lshl_b32 s11, s65, 8 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s64, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s12, s55, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s26, 0xff -; SI-NEXT: s_lshl_b32 s9, s46, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s44, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s10, s46, 8 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_and_b32 s10, s44, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_lshl_b32 s11, s42, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s27, 0xff +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xff ; SI-NEXT: s_lshl_b32 s9, s54, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s53, 0xff ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s52, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_lshl_b32 s10, s52, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s28, 0xff -; SI-NEXT: s_lshl_b32 s9, s40, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s14, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s12, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s8, s40, 8 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s26, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s29, 0xff -; SI-NEXT: s_lshl_b32 s9, s51, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s50, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s49, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: s_lshl_b32 s7, s51, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s50, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s49, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s7, s10, 8 -; SI-NEXT: s_or_b32 s4, s4, s7 -; SI-NEXT: s_and_b32 s7, s8, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_lshl_b32 s6, s24, 8 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s18, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -10062,38 +10187,38 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s85, v4, 29 -; SI-NEXT: v_readlane_b32 s84, v4, 28 -; SI-NEXT: v_readlane_b32 s83, v4, 27 -; SI-NEXT: v_readlane_b32 s82, v4, 26 -; SI-NEXT: v_readlane_b32 s81, v4, 25 -; SI-NEXT: v_readlane_b32 s80, v4, 24 -; SI-NEXT: v_readlane_b32 s71, v4, 23 -; SI-NEXT: v_readlane_b32 s70, v4, 22 -; SI-NEXT: v_readlane_b32 s69, v4, 21 -; SI-NEXT: v_readlane_b32 s68, v4, 20 -; SI-NEXT: v_readlane_b32 s67, v4, 19 -; SI-NEXT: v_readlane_b32 s66, v4, 18 -; SI-NEXT: v_readlane_b32 s65, v4, 17 -; SI-NEXT: v_readlane_b32 s64, v4, 16 -; SI-NEXT: v_readlane_b32 s55, v4, 15 -; SI-NEXT: v_readlane_b32 s54, v4, 14 -; SI-NEXT: v_readlane_b32 s53, v4, 13 -; SI-NEXT: v_readlane_b32 s52, v4, 12 -; SI-NEXT: v_readlane_b32 s51, v4, 11 -; SI-NEXT: v_readlane_b32 s50, v4, 10 -; SI-NEXT: v_readlane_b32 s49, v4, 9 -; SI-NEXT: v_readlane_b32 s48, v4, 8 -; SI-NEXT: v_readlane_b32 s39, v4, 7 -; SI-NEXT: v_readlane_b32 s38, v4, 6 -; SI-NEXT: v_readlane_b32 s37, v4, 5 -; SI-NEXT: v_readlane_b32 s36, v4, 4 -; SI-NEXT: v_readlane_b32 s35, v4, 3 -; SI-NEXT: v_readlane_b32 s34, v4, 2 -; SI-NEXT: v_readlane_b32 s31, v4, 1 -; SI-NEXT: v_readlane_b32 s30, v4, 0 +; SI-NEXT: v_readlane_b32 s85, v18, 29 +; SI-NEXT: v_readlane_b32 s84, v18, 28 +; SI-NEXT: v_readlane_b32 s83, v18, 27 +; SI-NEXT: v_readlane_b32 s82, v18, 26 +; SI-NEXT: v_readlane_b32 s81, v18, 25 +; SI-NEXT: v_readlane_b32 s80, v18, 24 +; SI-NEXT: v_readlane_b32 s71, v18, 23 +; SI-NEXT: v_readlane_b32 s70, v18, 22 +; SI-NEXT: v_readlane_b32 s69, v18, 21 +; SI-NEXT: v_readlane_b32 s68, v18, 20 +; SI-NEXT: v_readlane_b32 s67, v18, 19 +; SI-NEXT: v_readlane_b32 s66, v18, 18 +; SI-NEXT: v_readlane_b32 s65, v18, 17 +; SI-NEXT: v_readlane_b32 s64, v18, 16 +; SI-NEXT: v_readlane_b32 s55, v18, 15 +; SI-NEXT: v_readlane_b32 s54, v18, 14 +; SI-NEXT: v_readlane_b32 s53, v18, 13 +; SI-NEXT: v_readlane_b32 s52, v18, 12 +; SI-NEXT: v_readlane_b32 s51, v18, 11 +; SI-NEXT: v_readlane_b32 s50, v18, 10 +; SI-NEXT: v_readlane_b32 s49, v18, 9 +; SI-NEXT: v_readlane_b32 s48, v18, 8 +; SI-NEXT: v_readlane_b32 s39, v18, 7 +; SI-NEXT: v_readlane_b32 s38, v18, 6 +; SI-NEXT: v_readlane_b32 s37, v18, 5 +; SI-NEXT: v_readlane_b32 s36, v18, 4 +; SI-NEXT: v_readlane_b32 s35, v18, 3 +; SI-NEXT: v_readlane_b32 s34, v18, 2 +; SI-NEXT: v_readlane_b32 s31, v18, 1 +; SI-NEXT: v_readlane_b32 s30, v18, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -10141,43 +10266,71 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: s_branch .LBB25_2 ; ; VI-LABEL: bitcast_v16i32_to_v64i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v4, s30, 0 -; VI-NEXT: v_writelane_b32 v4, s31, 1 -; VI-NEXT: v_writelane_b32 v4, s34, 2 -; VI-NEXT: v_writelane_b32 v4, s35, 3 -; VI-NEXT: v_writelane_b32 v4, s36, 4 -; VI-NEXT: v_writelane_b32 v4, s37, 5 -; VI-NEXT: v_writelane_b32 v4, s38, 6 -; VI-NEXT: v_writelane_b32 v4, s39, 7 -; VI-NEXT: v_writelane_b32 v4, s48, 8 -; VI-NEXT: v_writelane_b32 v4, s49, 9 -; VI-NEXT: v_writelane_b32 v4, s50, 10 -; VI-NEXT: v_writelane_b32 v4, s51, 11 -; VI-NEXT: v_writelane_b32 v4, s52, 12 -; VI-NEXT: v_writelane_b32 v4, s53, 13 -; VI-NEXT: v_writelane_b32 v4, s54, 14 -; VI-NEXT: v_writelane_b32 v4, s55, 15 -; VI-NEXT: v_writelane_b32 v4, s64, 16 -; VI-NEXT: v_writelane_b32 v4, s65, 17 +; VI-NEXT: v_writelane_b32 v18, s30, 0 +; VI-NEXT: v_writelane_b32 v18, s31, 1 +; VI-NEXT: v_writelane_b32 v18, s34, 2 +; VI-NEXT: v_writelane_b32 v18, s35, 3 +; VI-NEXT: v_writelane_b32 v18, s36, 4 +; VI-NEXT: v_writelane_b32 v18, s37, 5 +; VI-NEXT: v_writelane_b32 v18, s38, 6 +; VI-NEXT: v_writelane_b32 v18, s39, 7 +; VI-NEXT: v_writelane_b32 v18, s48, 8 +; VI-NEXT: v_writelane_b32 v18, s49, 9 +; VI-NEXT: v_writelane_b32 v18, s50, 10 +; VI-NEXT: v_writelane_b32 v18, s51, 11 +; VI-NEXT: v_writelane_b32 v18, s52, 12 +; VI-NEXT: v_writelane_b32 v18, s53, 13 +; VI-NEXT: v_writelane_b32 v18, s54, 14 +; VI-NEXT: v_writelane_b32 v18, s55, 15 +; VI-NEXT: v_writelane_b32 v18, s64, 16 +; VI-NEXT: v_writelane_b32 v18, s65, 17 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v17, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: v_writelane_b32 v4, s66, 18 +; VI-NEXT: v_writelane_b32 v18, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s18, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s16, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s14, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s12, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s10, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s8, v14 +; VI-NEXT: v_readfirstlane_b32 s9, v15 +; VI-NEXT: v_readfirstlane_b32 s6, v16 +; VI-NEXT: v_readfirstlane_b32 s7, v17 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_and_b64 s[20:21], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: v_writelane_b32 v4, s67, 19 +; VI-NEXT: v_writelane_b32 v18, s67, 19 ; VI-NEXT: s_cbranch_scc0 .LBB25_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b32 s56, s5, 24 @@ -10185,287 +10338,287 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; VI-NEXT: s_lshr_b32 s58, s5, 8 ; VI-NEXT: s_lshr_b32 s59, s4, 16 ; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s29, 8 -; VI-NEXT: s_lshr_b32 s72, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 8 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s89, s25, 8 -; VI-NEXT: s_lshr_b32 s90, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s34, s23, 8 -; VI-NEXT: s_lshr_b32 s35, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s39, s21, 8 -; VI-NEXT: s_lshr_b32 s48, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s52, s19, 8 -; VI-NEXT: s_lshr_b32 s53, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s65, s17, 8 -; VI-NEXT: s_lshr_b32 s66, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 -; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s61, s7, 24 +; VI-NEXT: s_lshr_b32 s62, s7, 16 +; VI-NEXT: s_lshr_b32 s63, s7, 8 +; VI-NEXT: s_lshr_b32 s72, s6, 16 +; VI-NEXT: s_lshr_b32 s73, s6, 8 +; VI-NEXT: s_lshr_b32 s74, s9, 24 +; VI-NEXT: s_lshr_b32 s75, s9, 16 +; VI-NEXT: s_lshr_b32 s76, s9, 8 +; VI-NEXT: s_lshr_b32 s77, s8, 16 +; VI-NEXT: s_lshr_b32 s78, s8, 8 +; VI-NEXT: s_lshr_b32 s79, s11, 24 +; VI-NEXT: s_lshr_b32 s88, s11, 16 +; VI-NEXT: s_lshr_b32 s89, s11, 8 +; VI-NEXT: s_lshr_b32 s90, s10, 16 +; VI-NEXT: s_lshr_b32 s91, s10, 8 +; VI-NEXT: s_lshr_b32 s30, s13, 24 +; VI-NEXT: s_lshr_b32 s31, s13, 16 +; VI-NEXT: s_lshr_b32 s34, s13, 8 +; VI-NEXT: s_lshr_b32 s35, s12, 16 +; VI-NEXT: s_lshr_b32 s36, s12, 8 +; VI-NEXT: s_lshr_b32 s37, s15, 24 +; VI-NEXT: s_lshr_b32 s38, s15, 16 +; VI-NEXT: s_lshr_b32 s39, s15, 8 +; VI-NEXT: s_lshr_b32 s48, s14, 16 +; VI-NEXT: s_lshr_b32 s49, s14, 8 +; VI-NEXT: s_lshr_b32 s50, s17, 24 +; VI-NEXT: s_lshr_b32 s51, s17, 16 +; VI-NEXT: s_lshr_b32 s52, s17, 8 +; VI-NEXT: s_lshr_b32 s53, s16, 16 +; VI-NEXT: s_lshr_b32 s54, s16, 8 +; VI-NEXT: s_lshr_b32 s55, s19, 24 +; VI-NEXT: s_lshr_b32 s64, s19, 16 +; VI-NEXT: s_lshr_b32 s65, s19, 8 +; VI-NEXT: s_lshr_b32 s66, s18, 16 +; VI-NEXT: s_lshr_b32 s67, s18, 8 +; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; VI-NEXT: s_cbranch_execnz .LBB25_3 ; VI-NEXT: .LBB25_2: ; %cmp.true -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_add_i32 s19, s19, 3 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 ; VI-NEXT: s_add_i32 s5, s5, 3 ; VI-NEXT: s_add_i32 s4, s4, 3 -; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; VI-NEXT: s_lshr_b32 s56, s5, 24 ; VI-NEXT: s_lshr_b32 s57, s5, 16 ; VI-NEXT: s_lshr_b32 s58, s5, 8 ; VI-NEXT: s_lshr_b32 s59, s4, 16 ; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s29, 8 -; VI-NEXT: s_lshr_b32 s72, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 8 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s89, s25, 8 -; VI-NEXT: s_lshr_b32 s90, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s34, s23, 8 -; VI-NEXT: s_lshr_b32 s35, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s39, s21, 8 -; VI-NEXT: s_lshr_b32 s48, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s52, s19, 8 -; VI-NEXT: s_lshr_b32 s53, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s65, s17, 8 -; VI-NEXT: s_lshr_b32 s66, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b32 s61, s7, 24 +; VI-NEXT: s_lshr_b32 s62, s7, 16 +; VI-NEXT: s_lshr_b32 s63, s7, 8 +; VI-NEXT: s_lshr_b32 s72, s6, 16 +; VI-NEXT: s_lshr_b32 s73, s6, 8 +; VI-NEXT: s_lshr_b32 s74, s9, 24 +; VI-NEXT: s_lshr_b32 s75, s9, 16 +; VI-NEXT: s_lshr_b32 s76, s9, 8 +; VI-NEXT: s_lshr_b32 s77, s8, 16 +; VI-NEXT: s_lshr_b32 s78, s8, 8 +; VI-NEXT: s_lshr_b32 s79, s11, 24 +; VI-NEXT: s_lshr_b32 s88, s11, 16 +; VI-NEXT: s_lshr_b32 s89, s11, 8 +; VI-NEXT: s_lshr_b32 s90, s10, 16 +; VI-NEXT: s_lshr_b32 s91, s10, 8 +; VI-NEXT: s_lshr_b32 s30, s13, 24 +; VI-NEXT: s_lshr_b32 s31, s13, 16 +; VI-NEXT: s_lshr_b32 s34, s13, 8 +; VI-NEXT: s_lshr_b32 s35, s12, 16 +; VI-NEXT: s_lshr_b32 s36, s12, 8 +; VI-NEXT: s_lshr_b32 s37, s15, 24 +; VI-NEXT: s_lshr_b32 s38, s15, 16 +; VI-NEXT: s_lshr_b32 s39, s15, 8 +; VI-NEXT: s_lshr_b32 s48, s14, 16 +; VI-NEXT: s_lshr_b32 s49, s14, 8 +; VI-NEXT: s_lshr_b32 s50, s17, 24 +; VI-NEXT: s_lshr_b32 s51, s17, 16 +; VI-NEXT: s_lshr_b32 s52, s17, 8 +; VI-NEXT: s_lshr_b32 s53, s16, 16 +; VI-NEXT: s_lshr_b32 s54, s16, 8 +; VI-NEXT: s_lshr_b32 s55, s19, 24 +; VI-NEXT: s_lshr_b32 s64, s19, 16 +; VI-NEXT: s_lshr_b32 s65, s19, 8 +; VI-NEXT: s_lshr_b32 s66, s18, 16 +; VI-NEXT: s_lshr_b32 s67, s18, 8 ; VI-NEXT: .LBB25_3: ; %end -; VI-NEXT: s_and_b32 s7, s16, 0xff -; VI-NEXT: s_lshl_b32 s9, s67, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s66, 0xff -; VI-NEXT: s_lshl_b32 s11, s44, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_and_b32 s7, s17, 0xff -; VI-NEXT: s_lshl_b32 s9, s65, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s64, 0xff -; VI-NEXT: s_lshl_b32 s11, s55, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s54, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s53, 0xff -; VI-NEXT: s_lshl_b32 s11, s42, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_lshl_b32 s21, s67, 8 +; VI-NEXT: s_or_b32 s18, s18, s21 +; VI-NEXT: s_and_b32 s21, s66, 0xff +; VI-NEXT: s_lshl_b32 s23, s44, 8 +; VI-NEXT: s_or_b32 s21, s21, s23 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s21, s21, 16 +; VI-NEXT: s_or_b32 s18, s18, s21 +; VI-NEXT: v_mov_b32_e32 v1, s18 +; VI-NEXT: s_and_b32 s18, s19, 0xff +; VI-NEXT: s_lshl_b32 s19, s65, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, s64, 0xff +; VI-NEXT: s_lshl_b32 s21, s55, 8 +; VI-NEXT: s_or_b32 s19, s19, s21 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s19, s19, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_lshl_b32 s18, s54, 8 +; VI-NEXT: s_or_b32 s16, s16, s18 +; VI-NEXT: s_and_b32 s18, s53, 0xff +; VI-NEXT: s_lshl_b32 s19, s42, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s18, s18, 16 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s16, s16, s18 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s19, 0xff -; VI-NEXT: s_lshl_b32 s9, s52, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s51, 0xff -; VI-NEXT: s_lshl_b32 s11, s50, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xff +; VI-NEXT: s_lshl_b32 s17, s52, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s51, 0xff +; VI-NEXT: s_lshl_b32 s18, s50, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s20, 0xff -; VI-NEXT: s_lshl_b32 s9, s49, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s48, 0xff -; VI-NEXT: s_lshl_b32 s11, s40, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s16, s49, 8 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s48, 0xff +; VI-NEXT: s_lshl_b32 s17, s40, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s14, s14, s16 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s21, 0xff -; VI-NEXT: s_lshl_b32 s9, s39, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s38, 0xff -; VI-NEXT: s_lshl_b32 s11, s37, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s14, s15, 0xff +; VI-NEXT: s_lshl_b32 s15, s39, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s15, s38, 0xff +; VI-NEXT: s_lshl_b32 s16, s37, 8 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s14, s14, s15 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s22, 0xff -; VI-NEXT: s_lshl_b32 s9, s36, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s35, 0xff -; VI-NEXT: s_lshl_b32 s11, s14, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s14, s36, 8 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: s_and_b32 s14, s35, 0xff +; VI-NEXT: s_lshl_b32 s15, s28, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s12, s12, s14 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s23, 0xff -; VI-NEXT: s_lshl_b32 s9, s34, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s31, 0xff -; VI-NEXT: s_lshl_b32 s11, s30, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s12, s13, 0xff +; VI-NEXT: s_lshl_b32 s13, s34, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s13, s31, 0xff +; VI-NEXT: s_lshl_b32 s14, s30, 8 +; VI-NEXT: s_or_b32 s13, s13, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s12, s12, s13 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_lshl_b32 s9, s91, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s90, 0xff -; VI-NEXT: s_lshl_b32 s11, s12, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s12, s91, 8 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: s_and_b32 s12, s90, 0xff +; VI-NEXT: s_lshl_b32 s13, s26, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s10, s10, s12 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s25, 0xff -; VI-NEXT: s_lshl_b32 s9, s89, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s88, 0xff -; VI-NEXT: s_lshl_b32 s11, s79, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s10, s11, 0xff +; VI-NEXT: s_lshl_b32 s11, s89, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s88, 0xff +; VI-NEXT: s_lshl_b32 s12, s79, 8 +; VI-NEXT: s_or_b32 s11, s11, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s10, s10, s11 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s9, s78, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s77, 0xff -; VI-NEXT: s_lshl_b32 s10, s10, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s10, s78, 8 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s77, 0xff +; VI-NEXT: s_lshl_b32 s11, s24, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s10 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s27, 0xff +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s8, s9, 0xff ; VI-NEXT: s_lshl_b32 s9, s76, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s9, s75, 0xff ; VI-NEXT: s_lshl_b32 s10, s74, 8 ; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s28, 0xff -; VI-NEXT: s_lshl_b32 s9, s73, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s72, 0xff -; VI-NEXT: s_lshl_b32 s8, s8, 8 -; VI-NEXT: s_or_b32 s8, s9, s8 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s8, s73, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s72, 0xff +; VI-NEXT: s_lshl_b32 s9, s22, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s29, 0xff -; VI-NEXT: s_lshl_b32 s8, s63, 8 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_and_b32 s6, s7, 0xff +; VI-NEXT: s_lshl_b32 s7, s63, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s62, 0xff +; VI-NEXT: s_lshl_b32 s8, s61, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s62, 0xff -; VI-NEXT: s_lshl_b32 s9, s61, 8 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xff -; VI-NEXT: s_lshl_b32 s7, s60, 8 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s59, 0xff -; VI-NEXT: s_lshl_b32 s6, s6, 8 -; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_lshl_b32 s6, s60, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s59, 0xff +; VI-NEXT: s_lshl_b32 s7, s20, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 @@ -10486,28 +10639,28 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_readlane_b32 s67, v4, 19 -; VI-NEXT: v_readlane_b32 s66, v4, 18 -; VI-NEXT: v_readlane_b32 s65, v4, 17 -; VI-NEXT: v_readlane_b32 s64, v4, 16 -; VI-NEXT: v_readlane_b32 s55, v4, 15 -; VI-NEXT: v_readlane_b32 s54, v4, 14 -; VI-NEXT: v_readlane_b32 s53, v4, 13 -; VI-NEXT: v_readlane_b32 s52, v4, 12 -; VI-NEXT: v_readlane_b32 s51, v4, 11 -; VI-NEXT: v_readlane_b32 s50, v4, 10 -; VI-NEXT: v_readlane_b32 s49, v4, 9 -; VI-NEXT: v_readlane_b32 s48, v4, 8 -; VI-NEXT: v_readlane_b32 s39, v4, 7 -; VI-NEXT: v_readlane_b32 s38, v4, 6 -; VI-NEXT: v_readlane_b32 s37, v4, 5 -; VI-NEXT: v_readlane_b32 s36, v4, 4 -; VI-NEXT: v_readlane_b32 s35, v4, 3 -; VI-NEXT: v_readlane_b32 s34, v4, 2 -; VI-NEXT: v_readlane_b32 s31, v4, 1 -; VI-NEXT: v_readlane_b32 s30, v4, 0 +; VI-NEXT: v_readlane_b32 s67, v18, 19 +; VI-NEXT: v_readlane_b32 s66, v18, 18 +; VI-NEXT: v_readlane_b32 s65, v18, 17 +; VI-NEXT: v_readlane_b32 s64, v18, 16 +; VI-NEXT: v_readlane_b32 s55, v18, 15 +; VI-NEXT: v_readlane_b32 s54, v18, 14 +; VI-NEXT: v_readlane_b32 s53, v18, 13 +; VI-NEXT: v_readlane_b32 s52, v18, 12 +; VI-NEXT: v_readlane_b32 s51, v18, 11 +; VI-NEXT: v_readlane_b32 s50, v18, 10 +; VI-NEXT: v_readlane_b32 s49, v18, 9 +; VI-NEXT: v_readlane_b32 s48, v18, 8 +; VI-NEXT: v_readlane_b32 s39, v18, 7 +; VI-NEXT: v_readlane_b32 s38, v18, 6 +; VI-NEXT: v_readlane_b32 s37, v18, 5 +; VI-NEXT: v_readlane_b32 s36, v18, 4 +; VI-NEXT: v_readlane_b32 s35, v18, 3 +; VI-NEXT: v_readlane_b32 s34, v18, 2 +; VI-NEXT: v_readlane_b32 s31, v18, 1 +; VI-NEXT: v_readlane_b32 s30, v18, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -10532,31 +10685,31 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr37 ; VI-NEXT: ; implicit-def: $sgpr36 ; VI-NEXT: ; implicit-def: $sgpr35 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr34 ; VI-NEXT: ; implicit-def: $sgpr31 ; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr91 ; VI-NEXT: ; implicit-def: $sgpr90 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr89 ; VI-NEXT: ; implicit-def: $sgpr88 ; VI-NEXT: ; implicit-def: $sgpr79 ; VI-NEXT: ; implicit-def: $sgpr78 ; VI-NEXT: ; implicit-def: $sgpr77 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: ; implicit-def: $sgpr74 ; VI-NEXT: ; implicit-def: $sgpr73 ; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr22 ; VI-NEXT: ; implicit-def: $sgpr63 ; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: ; implicit-def: $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr59 -; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr20 ; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: ; implicit-def: $sgpr56 @@ -10566,28 +10719,56 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v4, s30, 0 -; GFX9-NEXT: v_writelane_b32 v4, s31, 1 -; GFX9-NEXT: v_writelane_b32 v4, s34, 2 -; GFX9-NEXT: v_writelane_b32 v4, s35, 3 -; GFX9-NEXT: v_writelane_b32 v4, s36, 4 -; GFX9-NEXT: v_writelane_b32 v4, s37, 5 -; GFX9-NEXT: v_writelane_b32 v4, s38, 6 -; GFX9-NEXT: v_writelane_b32 v4, s39, 7 -; GFX9-NEXT: v_writelane_b32 v4, s48, 8 -; GFX9-NEXT: v_writelane_b32 v4, s49, 9 -; GFX9-NEXT: v_writelane_b32 v4, s50, 10 -; GFX9-NEXT: v_writelane_b32 v4, s51, 11 -; GFX9-NEXT: v_writelane_b32 v4, s52, 12 -; GFX9-NEXT: v_writelane_b32 v4, s53, 13 +; GFX9-NEXT: v_writelane_b32 v18, s30, 0 +; GFX9-NEXT: v_writelane_b32 v18, s31, 1 +; GFX9-NEXT: v_writelane_b32 v18, s34, 2 +; GFX9-NEXT: v_writelane_b32 v18, s35, 3 +; GFX9-NEXT: v_writelane_b32 v18, s36, 4 +; GFX9-NEXT: v_writelane_b32 v18, s37, 5 +; GFX9-NEXT: v_writelane_b32 v18, s38, 6 +; GFX9-NEXT: v_writelane_b32 v18, s39, 7 +; GFX9-NEXT: v_writelane_b32 v18, s48, 8 +; GFX9-NEXT: v_writelane_b32 v18, s49, 9 +; GFX9-NEXT: v_writelane_b32 v18, s50, 10 +; GFX9-NEXT: v_writelane_b32 v18, s51, 11 +; GFX9-NEXT: v_writelane_b32 v18, s52, 12 +; GFX9-NEXT: v_writelane_b32 v18, s53, 13 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_mov_b32_e32 v8, s20 +; GFX9-NEXT: v_mov_b32_e32 v9, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s22 +; GFX9-NEXT: v_mov_b32_e32 v11, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v14, s26 +; GFX9-NEXT: v_mov_b32_e32 v15, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_writelane_b32 v4, s54, 14 +; GFX9-NEXT: v_writelane_b32 v18, s54, 14 +; GFX9-NEXT: v_readfirstlane_b32 s18, v4 +; GFX9-NEXT: v_readfirstlane_b32 s19, v5 +; GFX9-NEXT: v_readfirstlane_b32 s16, v6 +; GFX9-NEXT: v_readfirstlane_b32 s17, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s12, v10 +; GFX9-NEXT: v_readfirstlane_b32 s13, v11 +; GFX9-NEXT: v_readfirstlane_b32 s10, v12 +; GFX9-NEXT: v_readfirstlane_b32 s11, v13 +; GFX9-NEXT: v_readfirstlane_b32 s8, v14 +; GFX9-NEXT: v_readfirstlane_b32 s9, v15 +; GFX9-NEXT: v_readfirstlane_b32 s6, v16 +; GFX9-NEXT: v_readfirstlane_b32 s7, v17 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: v_writelane_b32 v4, s55, 15 +; GFX9-NEXT: v_writelane_b32 v18, s55, 15 ; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b32 s56, s5, 24 @@ -10595,275 +10776,275 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; GFX9-NEXT: s_lshr_b32 s58, s5, 8 ; GFX9-NEXT: s_lshr_b32 s59, s4, 16 ; GFX9-NEXT: s_lshr_b32 s60, s4, 8 -; GFX9-NEXT: s_lshr_b32 s61, s29, 24 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s63, s29, 8 -; GFX9-NEXT: s_lshr_b32 s72, s28, 16 -; GFX9-NEXT: s_lshr_b32 s73, s28, 8 -; GFX9-NEXT: s_lshr_b32 s74, s27, 24 -; GFX9-NEXT: s_lshr_b32 s75, s27, 16 -; GFX9-NEXT: s_lshr_b32 s76, s27, 8 -; GFX9-NEXT: s_lshr_b32 s77, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s26, 8 -; GFX9-NEXT: s_lshr_b32 s79, s25, 24 -; GFX9-NEXT: s_lshr_b32 s88, s25, 16 -; GFX9-NEXT: s_lshr_b32 s89, s25, 8 -; GFX9-NEXT: s_lshr_b32 s90, s24, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 8 -; GFX9-NEXT: s_lshr_b32 s92, s23, 24 -; GFX9-NEXT: s_lshr_b32 s93, s23, 16 -; GFX9-NEXT: s_lshr_b32 s94, s23, 8 -; GFX9-NEXT: s_lshr_b32 s95, s22, 16 -; GFX9-NEXT: s_lshr_b32 s30, s22, 8 -; GFX9-NEXT: s_lshr_b32 s31, s21, 24 -; GFX9-NEXT: s_lshr_b32 s34, s21, 16 -; GFX9-NEXT: s_lshr_b32 s35, s21, 8 -; GFX9-NEXT: s_lshr_b32 s36, s20, 16 -; GFX9-NEXT: s_lshr_b32 s37, s20, 8 -; GFX9-NEXT: s_lshr_b32 s38, s19, 24 -; GFX9-NEXT: s_lshr_b32 s39, s19, 16 -; GFX9-NEXT: s_lshr_b32 s48, s19, 8 -; GFX9-NEXT: s_lshr_b32 s49, s18, 16 -; GFX9-NEXT: s_lshr_b32 s50, s18, 8 -; GFX9-NEXT: s_lshr_b32 s51, s17, 24 -; GFX9-NEXT: s_lshr_b32 s52, s17, 16 -; GFX9-NEXT: s_lshr_b32 s53, s17, 8 -; GFX9-NEXT: s_lshr_b32 s54, s16, 16 -; GFX9-NEXT: s_lshr_b32 s55, s16, 8 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s61, s7, 24 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s63, s7, 8 +; GFX9-NEXT: s_lshr_b32 s72, s6, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 8 +; GFX9-NEXT: s_lshr_b32 s74, s9, 24 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s76, s9, 8 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s8, 8 +; GFX9-NEXT: s_lshr_b32 s79, s11, 24 +; GFX9-NEXT: s_lshr_b32 s88, s11, 16 +; GFX9-NEXT: s_lshr_b32 s89, s11, 8 +; GFX9-NEXT: s_lshr_b32 s90, s10, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 8 +; GFX9-NEXT: s_lshr_b32 s92, s13, 24 +; GFX9-NEXT: s_lshr_b32 s93, s13, 16 +; GFX9-NEXT: s_lshr_b32 s94, s13, 8 +; GFX9-NEXT: s_lshr_b32 s95, s12, 16 +; GFX9-NEXT: s_lshr_b32 s30, s12, 8 +; GFX9-NEXT: s_lshr_b32 s31, s15, 24 +; GFX9-NEXT: s_lshr_b32 s34, s15, 16 +; GFX9-NEXT: s_lshr_b32 s35, s15, 8 +; GFX9-NEXT: s_lshr_b32 s36, s14, 16 +; GFX9-NEXT: s_lshr_b32 s37, s14, 8 +; GFX9-NEXT: s_lshr_b32 s38, s17, 24 +; GFX9-NEXT: s_lshr_b32 s39, s17, 16 +; GFX9-NEXT: s_lshr_b32 s48, s17, 8 +; GFX9-NEXT: s_lshr_b32 s49, s16, 16 +; GFX9-NEXT: s_lshr_b32 s50, s16, 8 +; GFX9-NEXT: s_lshr_b32 s51, s19, 24 +; GFX9-NEXT: s_lshr_b32 s52, s19, 16 +; GFX9-NEXT: s_lshr_b32 s53, s19, 8 +; GFX9-NEXT: s_lshr_b32 s54, s18, 16 +; GFX9-NEXT: s_lshr_b32 s55, s18, 8 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB25_3 ; GFX9-NEXT: .LBB25_2: ; %cmp.true -; GFX9-NEXT: s_add_i32 s17, s17, 3 -; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_add_i32 s19, s19, 3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 -; GFX9-NEXT: s_add_i32 s21, s21, 3 -; GFX9-NEXT: s_add_i32 s20, s20, 3 -; GFX9-NEXT: s_add_i32 s23, s23, 3 -; GFX9-NEXT: s_add_i32 s22, s22, 3 -; GFX9-NEXT: s_add_i32 s25, s25, 3 -; GFX9-NEXT: s_add_i32 s24, s24, 3 -; GFX9-NEXT: s_add_i32 s27, s27, 3 -; GFX9-NEXT: s_add_i32 s26, s26, 3 -; GFX9-NEXT: s_add_i32 s29, s29, 3 -; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_add_i32 s5, s5, 3 ; GFX9-NEXT: s_add_i32 s4, s4, 3 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; GFX9-NEXT: s_lshr_b32 s56, s5, 24 ; GFX9-NEXT: s_lshr_b32 s57, s5, 16 ; GFX9-NEXT: s_lshr_b32 s58, s5, 8 ; GFX9-NEXT: s_lshr_b32 s59, s4, 16 ; GFX9-NEXT: s_lshr_b32 s60, s4, 8 -; GFX9-NEXT: s_lshr_b32 s61, s29, 24 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s63, s29, 8 -; GFX9-NEXT: s_lshr_b32 s72, s28, 16 -; GFX9-NEXT: s_lshr_b32 s73, s28, 8 -; GFX9-NEXT: s_lshr_b32 s74, s27, 24 -; GFX9-NEXT: s_lshr_b32 s75, s27, 16 -; GFX9-NEXT: s_lshr_b32 s76, s27, 8 -; GFX9-NEXT: s_lshr_b32 s77, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s26, 8 -; GFX9-NEXT: s_lshr_b32 s79, s25, 24 -; GFX9-NEXT: s_lshr_b32 s88, s25, 16 -; GFX9-NEXT: s_lshr_b32 s89, s25, 8 -; GFX9-NEXT: s_lshr_b32 s90, s24, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 8 -; GFX9-NEXT: s_lshr_b32 s92, s23, 24 -; GFX9-NEXT: s_lshr_b32 s93, s23, 16 -; GFX9-NEXT: s_lshr_b32 s94, s23, 8 -; GFX9-NEXT: s_lshr_b32 s95, s22, 16 -; GFX9-NEXT: s_lshr_b32 s30, s22, 8 -; GFX9-NEXT: s_lshr_b32 s31, s21, 24 -; GFX9-NEXT: s_lshr_b32 s34, s21, 16 -; GFX9-NEXT: s_lshr_b32 s35, s21, 8 -; GFX9-NEXT: s_lshr_b32 s36, s20, 16 -; GFX9-NEXT: s_lshr_b32 s37, s20, 8 -; GFX9-NEXT: s_lshr_b32 s38, s19, 24 -; GFX9-NEXT: s_lshr_b32 s39, s19, 16 -; GFX9-NEXT: s_lshr_b32 s48, s19, 8 -; GFX9-NEXT: s_lshr_b32 s49, s18, 16 -; GFX9-NEXT: s_lshr_b32 s50, s18, 8 -; GFX9-NEXT: s_lshr_b32 s51, s17, 24 -; GFX9-NEXT: s_lshr_b32 s52, s17, 16 -; GFX9-NEXT: s_lshr_b32 s53, s17, 8 -; GFX9-NEXT: s_lshr_b32 s54, s16, 16 -; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b32 s61, s7, 24 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s63, s7, 8 +; GFX9-NEXT: s_lshr_b32 s72, s6, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 8 +; GFX9-NEXT: s_lshr_b32 s74, s9, 24 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s76, s9, 8 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s8, 8 +; GFX9-NEXT: s_lshr_b32 s79, s11, 24 +; GFX9-NEXT: s_lshr_b32 s88, s11, 16 +; GFX9-NEXT: s_lshr_b32 s89, s11, 8 +; GFX9-NEXT: s_lshr_b32 s90, s10, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 8 +; GFX9-NEXT: s_lshr_b32 s92, s13, 24 +; GFX9-NEXT: s_lshr_b32 s93, s13, 16 +; GFX9-NEXT: s_lshr_b32 s94, s13, 8 +; GFX9-NEXT: s_lshr_b32 s95, s12, 16 +; GFX9-NEXT: s_lshr_b32 s30, s12, 8 +; GFX9-NEXT: s_lshr_b32 s31, s15, 24 +; GFX9-NEXT: s_lshr_b32 s34, s15, 16 +; GFX9-NEXT: s_lshr_b32 s35, s15, 8 +; GFX9-NEXT: s_lshr_b32 s36, s14, 16 +; GFX9-NEXT: s_lshr_b32 s37, s14, 8 +; GFX9-NEXT: s_lshr_b32 s38, s17, 24 +; GFX9-NEXT: s_lshr_b32 s39, s17, 16 +; GFX9-NEXT: s_lshr_b32 s48, s17, 8 +; GFX9-NEXT: s_lshr_b32 s49, s16, 16 +; GFX9-NEXT: s_lshr_b32 s50, s16, 8 +; GFX9-NEXT: s_lshr_b32 s51, s19, 24 +; GFX9-NEXT: s_lshr_b32 s52, s19, 16 +; GFX9-NEXT: s_lshr_b32 s53, s19, 8 +; GFX9-NEXT: s_lshr_b32 s54, s18, 16 +; GFX9-NEXT: s_lshr_b32 s55, s18, 8 ; GFX9-NEXT: .LBB25_3: ; %end -; GFX9-NEXT: s_and_b32 s7, s16, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s55, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s54, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s44, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s53, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s52, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s51, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s18, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s55, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s21 +; GFX9-NEXT: s_and_b32 s21, s54, 0xff +; GFX9-NEXT: s_lshl_b32 s23, s44, 8 +; GFX9-NEXT: s_or_b32 s21, s21, s23 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s21, s21, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s21 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s18, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s53, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: s_and_b32 s19, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s51, 8 +; GFX9-NEXT: s_or_b32 s19, s19, s21 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s19, s19, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s19 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s50, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s49, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s42, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s50, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s18 +; GFX9-NEXT: s_and_b32 s18, s49, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s42, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s18, s18, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s19, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s48, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s39, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s38, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s48, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s38, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s37, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s36, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s40, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s37, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s16 +; GFX9-NEXT: s_and_b32 s16, s36, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s40, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s16, s16, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s16 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s21, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s35, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s34, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s31, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s14, s15, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s35, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s15, s34, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s31, 8 +; GFX9-NEXT: s_or_b32 s15, s15, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s15, s15, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s15 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s30, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s95, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s14, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s30, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s14 +; GFX9-NEXT: s_and_b32 s14, s95, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s28, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s14, s14, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s14 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s23, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s94, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s93, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s92, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s12, s13, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s94, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s13, s93, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s92, 8 +; GFX9-NEXT: s_or_b32 s13, s13, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s13, s13, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s13 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s91, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s90, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s12, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s91, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s12 +; GFX9-NEXT: s_and_b32 s12, s90, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s26, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s12, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s12 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s25, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s89, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s88, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s79, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s89, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s11, s88, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s79, 8 +; GFX9-NEXT: s_or_b32 s11, s11, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s11, s11, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s78, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s77, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s10 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s78, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s10 +; GFX9-NEXT: s_and_b32 s10, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s24, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: s_or_b32 s8, s8, s10 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s27, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s8, s9, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s76, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_and_b32 s9, s75, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s74, 8 ; GFX9-NEXT: s_or_b32 s9, s9, s10 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff ; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s73, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s72, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_or_b32 s8, s9, s8 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s29, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s63, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s8, s62, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s61, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s73, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: s_and_b32 s8, s72, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s22, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s7, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s63, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s62, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s61, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s60, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s7 -; GFX9-NEXT: s_and_b32 s7, s59, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_lshl_b32 s6, s60, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s59, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s20, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s6 @@ -10881,24 +11062,24 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_readlane_b32 s55, v4, 15 -; GFX9-NEXT: v_readlane_b32 s54, v4, 14 -; GFX9-NEXT: v_readlane_b32 s53, v4, 13 -; GFX9-NEXT: v_readlane_b32 s52, v4, 12 -; GFX9-NEXT: v_readlane_b32 s51, v4, 11 -; GFX9-NEXT: v_readlane_b32 s50, v4, 10 -; GFX9-NEXT: v_readlane_b32 s49, v4, 9 -; GFX9-NEXT: v_readlane_b32 s48, v4, 8 -; GFX9-NEXT: v_readlane_b32 s39, v4, 7 -; GFX9-NEXT: v_readlane_b32 s38, v4, 6 -; GFX9-NEXT: v_readlane_b32 s37, v4, 5 -; GFX9-NEXT: v_readlane_b32 s36, v4, 4 -; GFX9-NEXT: v_readlane_b32 s35, v4, 3 -; GFX9-NEXT: v_readlane_b32 s34, v4, 2 -; GFX9-NEXT: v_readlane_b32 s31, v4, 1 -; GFX9-NEXT: v_readlane_b32 s30, v4, 0 +; GFX9-NEXT: v_readlane_b32 s55, v18, 15 +; GFX9-NEXT: v_readlane_b32 s54, v18, 14 +; GFX9-NEXT: v_readlane_b32 s53, v18, 13 +; GFX9-NEXT: v_readlane_b32 s52, v18, 12 +; GFX9-NEXT: v_readlane_b32 s51, v18, 11 +; GFX9-NEXT: v_readlane_b32 s50, v18, 10 +; GFX9-NEXT: v_readlane_b32 s49, v18, 9 +; GFX9-NEXT: v_readlane_b32 s48, v18, 8 +; GFX9-NEXT: v_readlane_b32 s39, v18, 7 +; GFX9-NEXT: v_readlane_b32 s38, v18, 6 +; GFX9-NEXT: v_readlane_b32 s37, v18, 5 +; GFX9-NEXT: v_readlane_b32 s36, v18, 4 +; GFX9-NEXT: v_readlane_b32 s35, v18, 3 +; GFX9-NEXT: v_readlane_b32 s34, v18, 2 +; GFX9-NEXT: v_readlane_b32 s31, v18, 1 +; GFX9-NEXT: v_readlane_b32 s30, v18, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10923,31 +11104,31 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr31 ; GFX9-NEXT: ; implicit-def: $sgpr30 ; GFX9-NEXT: ; implicit-def: $sgpr95 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr94 ; GFX9-NEXT: ; implicit-def: $sgpr93 ; GFX9-NEXT: ; implicit-def: $sgpr92 ; GFX9-NEXT: ; implicit-def: $sgpr91 ; GFX9-NEXT: ; implicit-def: $sgpr90 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr89 ; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr79 ; GFX9-NEXT: ; implicit-def: $sgpr78 ; GFX9-NEXT: ; implicit-def: $sgpr77 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: ; implicit-def: $sgpr76 ; GFX9-NEXT: ; implicit-def: $sgpr75 ; GFX9-NEXT: ; implicit-def: $sgpr74 ; GFX9-NEXT: ; implicit-def: $sgpr73 ; GFX9-NEXT: ; implicit-def: $sgpr72 -; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr22 ; GFX9-NEXT: ; implicit-def: $sgpr63 ; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr61 ; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr59 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr20 ; GFX9-NEXT: ; implicit-def: $sgpr58 ; GFX9-NEXT: ; implicit-def: $sgpr57 ; GFX9-NEXT: ; implicit-def: $sgpr56 @@ -17793,111 +17974,139 @@ define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a, ; VI-LABEL: bitcast_v32i16_to_v16f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s25 +; VI-NEXT: v_mov_b32_e32 v13, s26 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v3 +; VI-NEXT: v_readfirstlane_b32 s7, v4 +; VI-NEXT: v_readfirstlane_b32 s8, v5 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s10, v7 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 +; VI-NEXT: v_readfirstlane_b32 s15, v12 +; VI-NEXT: v_readfirstlane_b32 s16, v13 +; VI-NEXT: v_readfirstlane_b32 s17, v14 +; VI-NEXT: v_readfirstlane_b32 s18, v15 +; VI-NEXT: v_readfirstlane_b32 s19, v16 +; VI-NEXT: v_readfirstlane_b32 s20, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v1 ; VI-NEXT: s_cbranch_scc0 .LBB39_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB39_3 ; VI-NEXT: .LBB39_2: ; %cmp.true -; VI-NEXT: s_add_i32 s5, s7, 3 -; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s6, 3 -; VI-NEXT: s_add_i32 s7, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s29, 3 -; VI-NEXT: s_add_i32 s6, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s28, 3 -; VI-NEXT: s_add_i32 s29, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s27, 3 -; VI-NEXT: s_add_i32 s28, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s26, 3 -; VI-NEXT: s_add_i32 s27, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s25, 3 -; VI-NEXT: s_add_i32 s26, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s15, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s24, 3 -; VI-NEXT: s_add_i32 s25, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s14, 3 +; VI-NEXT: s_add_i32 s15, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s14, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s23, 3 -; VI-NEXT: s_add_i32 s24, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s13, 3 +; VI-NEXT: s_add_i32 s14, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s13, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s22, 3 -; VI-NEXT: s_add_i32 s23, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s12, 3 +; VI-NEXT: s_add_i32 s13, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s12, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s21, 3 -; VI-NEXT: s_add_i32 s22, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s11, 3 +; VI-NEXT: s_add_i32 s12, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s11, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s20, 3 -; VI-NEXT: s_add_i32 s21, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s10, 3 +; VI-NEXT: s_add_i32 s11, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s10, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s19, 3 -; VI-NEXT: s_add_i32 s20, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s9, 3 +; VI-NEXT: s_add_i32 s10, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s9, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s18, 3 -; VI-NEXT: s_add_i32 s19, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s8, 3 +; VI-NEXT: s_add_i32 s9, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s8, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s17, 3 -; VI-NEXT: s_add_i32 s18, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_add_i32 s8, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s17, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: .LBB39_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s6 -; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s18 +; VI-NEXT: v_mov_b32_e32 v13, s19 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB39_4: ; VI-NEXT: s_branch .LBB39_2 @@ -18307,79 +18516,91 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_mov_b32_e32 v55, s16 +; SI-NEXT: v_mov_b32_e32 v54, s17 +; SI-NEXT: v_mov_b32_e32 v53, s18 +; SI-NEXT: v_mov_b32_e32 v52, s19 +; SI-NEXT: v_mov_b32_e32 v51, s20 +; SI-NEXT: v_mov_b32_e32 v50, s21 +; SI-NEXT: v_mov_b32_e32 v49, s22 +; SI-NEXT: v_mov_b32_e32 v48, s23 +; SI-NEXT: v_mov_b32_e32 v39, s24 +; SI-NEXT: v_mov_b32_e32 v38, s25 +; SI-NEXT: v_mov_b32_e32 v36, s26 +; SI-NEXT: v_mov_b32_e32 v35, s27 +; SI-NEXT: v_mov_b32_e32 v34, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_mov_b32_e32 v37, s29 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v55 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v28, s6, 1.0 -; SI-NEXT: v_add_f32_e64 v30, s7, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v55 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v54 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v53 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v51 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v50 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v38 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v34 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v37 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 @@ -18392,10 +18613,10 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 @@ -18409,7 +18630,7 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 @@ -18424,13 +18645,15 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v32 +; SI-NEXT: v_mov_b32_e32 v1, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -21380,172 +21603,209 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshr_b64 v[0:1], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v41 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v59 +; SI-NEXT: v_lshr_b64 v[5:6], v[37:38], 16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; SI-NEXT: v_lshr_b64 v[6:7], v[35:36], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v57 +; SI-NEXT: v_lshr_b64 v[7:8], v[33:34], 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v56 +; SI-NEXT: v_lshr_b64 v[8:9], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v47 +; SI-NEXT: v_lshr_b64 v[9:10], v[29:30], 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v46 +; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v45 +; SI-NEXT: v_lshr_b64 v[11:12], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v44 +; SI-NEXT: v_lshr_b64 v[12:13], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshr_b64 v[13:14], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshr_b64 v[14:15], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v20, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16 +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v56 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v47 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v42 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 ; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -21554,687 +21814,665 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-LABEL: bitcast_v32bf16_to_v16f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 -; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: v_mov_b32_e32 v10, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s31, v1 -; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_cbranch_scc0 .LBB47_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: s_cbranch_execnz .LBB47_3 ; VI-NEXT: .LBB47_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[17:18] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[18:19] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[19:20] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[20:21] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[21:22] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_lshrrev_b64 v[21:22], 16, v[21:22] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[22:23] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[23:24] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v24, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; VI-NEXT: s_branch .LBB47_5 -; VI-NEXT: .LBB47_3: -; VI-NEXT: s_branch .LBB47_2 -; VI-NEXT: .LBB47_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s30 -; VI-NEXT: v_mov_b32_e32 v15, s31 -; VI-NEXT: .LBB47_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[24:25] +; VI-NEXT: v_mov_b32_e32 v1, v23 +; VI-NEXT: v_mov_b32_e32 v3, v22 +; VI-NEXT: v_mov_b32_e32 v5, v21 +; VI-NEXT: v_mov_b32_e32 v7, v20 +; VI-NEXT: v_mov_b32_e32 v9, v19 +; VI-NEXT: v_mov_b32_e32 v11, v18 +; VI-NEXT: v_mov_b32_e32 v13, v17 +; VI-NEXT: v_mov_b32_e32 v15, v16 +; VI-NEXT: .LBB47_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: s_branch .LBB47_2 ; ; GFX9-LABEL: bitcast_v32bf16_to_v16f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 -; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s31, v1 -; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 ; GFX9-NEXT: .LBB47_2: ; %cmp.true -; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s4, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v16 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v16, v16, v15 +; GFX9-NEXT: v_add_u32_e32 v16, 0x7fff, v16 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc ; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_and_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v17, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v14 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v14, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v17, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v13 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v17, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v12 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v17, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v11 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v17, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v10 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v17, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v9 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v17, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v8 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v17, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v7 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v17, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v6 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v5 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v17, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v4 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v17, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v3 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v1 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc ; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 -; GFX9-NEXT: s_branch .LBB47_5 -; GFX9-NEXT: .LBB47_3: -; GFX9-NEXT: s_branch .LBB47_2 -; GFX9-NEXT: .LBB47_4: -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 -; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: v_mov_b32_e32 v14, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s31 -; GFX9-NEXT: .LBB47_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v17, 16, v0 +; GFX9-NEXT: .LBB47_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: s_branch .LBB47_2 ; ; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v16f32_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -23309,22 +23547,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -23360,6 +23582,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -23642,22 +23880,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -23691,6 +23913,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -23962,25 +24200,9 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -24007,6 +24229,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -24604,92 +24842,120 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v40, s81, 25 ; SI-NEXT: v_writelane_b32 v40, s82, 26 ; SI-NEXT: v_writelane_b32 v40, s83, 27 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_writelane_b32 v40, s84, 28 -; SI-NEXT: v_readfirstlane_b32 s36, v1 +; SI-NEXT: v_readfirstlane_b32 s36, v4 +; SI-NEXT: v_readfirstlane_b32 s37, v5 +; SI-NEXT: v_readfirstlane_b32 s34, v6 +; SI-NEXT: v_readfirstlane_b32 s35, v7 +; SI-NEXT: v_readfirstlane_b32 s30, v8 +; SI-NEXT: v_readfirstlane_b32 s31, v9 +; SI-NEXT: v_readfirstlane_b32 s94, v10 +; SI-NEXT: v_readfirstlane_b32 s95, v11 +; SI-NEXT: v_readfirstlane_b32 s92, v12 +; SI-NEXT: v_readfirstlane_b32 s93, v13 +; SI-NEXT: v_readfirstlane_b32 s90, v14 +; SI-NEXT: v_readfirstlane_b32 s91, v15 +; SI-NEXT: v_readfirstlane_b32 s88, v16 +; SI-NEXT: v_readfirstlane_b32 s89, v17 +; SI-NEXT: v_readfirstlane_b32 s78, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s37, v2 +; SI-NEXT: v_readfirstlane_b32 s79, v2 ; SI-NEXT: v_writelane_b32 v40, s85, 29 ; SI-NEXT: s_cbranch_scc0 .LBB49_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s82, s37, 24 -; SI-NEXT: s_lshr_b32 s84, s37, 16 -; SI-NEXT: s_lshr_b32 s85, s37, 8 -; SI-NEXT: s_lshr_b32 s71, s29, 24 -; SI-NEXT: s_lshr_b32 s81, s29, 16 -; SI-NEXT: s_lshr_b32 s83, s29, 8 -; SI-NEXT: s_lshr_b32 s68, s27, 24 -; SI-NEXT: s_lshr_b32 s70, s27, 16 -; SI-NEXT: s_lshr_b32 s80, s27, 8 -; SI-NEXT: s_lshr_b32 s65, s25, 24 -; SI-NEXT: s_lshr_b32 s67, s25, 16 -; SI-NEXT: s_lshr_b32 s69, s25, 8 -; SI-NEXT: s_lshr_b32 s54, s23, 24 -; SI-NEXT: s_lshr_b32 s64, s23, 16 -; SI-NEXT: s_lshr_b32 s66, s23, 8 -; SI-NEXT: s_lshr_b32 s51, s21, 24 -; SI-NEXT: s_lshr_b32 s53, s21, 16 -; SI-NEXT: s_lshr_b32 s55, s21, 8 -; SI-NEXT: s_lshr_b32 s48, s19, 24 -; SI-NEXT: s_lshr_b32 s50, s19, 16 -; SI-NEXT: s_lshr_b32 s52, s19, 8 -; SI-NEXT: s_lshr_b32 s38, s17, 24 -; SI-NEXT: s_lshr_b32 s39, s17, 16 -; SI-NEXT: s_lshr_b32 s49, s17, 8 -; SI-NEXT: s_lshr_b64 s[4:5], s[36:37], 24 -; SI-NEXT: s_lshr_b64 s[6:7], s[36:37], 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[36:37], 8 -; SI-NEXT: s_lshr_b64 s[10:11], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[88:89], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s82, s79, 24 +; SI-NEXT: s_lshr_b32 s84, s79, 16 +; SI-NEXT: s_lshr_b32 s85, s79, 8 +; SI-NEXT: s_lshr_b32 s71, s89, 24 +; SI-NEXT: s_lshr_b32 s81, s89, 16 +; SI-NEXT: s_lshr_b32 s83, s89, 8 +; SI-NEXT: s_lshr_b32 s68, s91, 24 +; SI-NEXT: s_lshr_b32 s70, s91, 16 +; SI-NEXT: s_lshr_b32 s80, s91, 8 +; SI-NEXT: s_lshr_b32 s65, s93, 24 +; SI-NEXT: s_lshr_b32 s67, s93, 16 +; SI-NEXT: s_lshr_b32 s69, s93, 8 +; SI-NEXT: s_lshr_b32 s54, s95, 24 +; SI-NEXT: s_lshr_b32 s64, s95, 16 +; SI-NEXT: s_lshr_b32 s66, s95, 8 +; SI-NEXT: s_lshr_b32 s51, s31, 24 +; SI-NEXT: s_lshr_b32 s53, s31, 16 +; SI-NEXT: s_lshr_b32 s55, s31, 8 +; SI-NEXT: s_lshr_b32 s48, s35, 24 +; SI-NEXT: s_lshr_b32 s50, s35, 16 +; SI-NEXT: s_lshr_b32 s52, s35, 8 +; SI-NEXT: s_lshr_b32 s38, s37, 24 +; SI-NEXT: s_lshr_b32 s39, s37, 16 +; SI-NEXT: s_lshr_b32 s49, s37, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[78:79], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[78:79], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[78:79], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[88:89], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[88:89], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[88:89], 8 +; SI-NEXT: s_lshr_b64 s[16:17], s[90:91], 24 +; SI-NEXT: s_lshr_b64 s[18:19], s[90:91], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[90:91], 8 +; SI-NEXT: s_lshr_b64 s[22:23], s[92:93], 24 +; SI-NEXT: s_lshr_b64 s[24:25], s[92:93], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[92:93], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[94:95], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[94:95], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[94:95], 8 +; SI-NEXT: s_lshr_b64 s[28:29], s[30:31], 24 +; SI-NEXT: s_lshr_b64 s[40:41], s[30:31], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[30:31], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[34:35], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[34:35], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[34:35], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[36:37], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[36:37], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[36:37], 8 ; SI-NEXT: s_cbranch_execnz .LBB49_4 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v20, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v1, s37, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s36, 1.0 -; SI-NEXT: v_readfirstlane_b32 s16, v22 -; SI-NEXT: v_readfirstlane_b32 s17, v20 -; SI-NEXT: v_readfirstlane_b32 s18, v18 -; SI-NEXT: v_readfirstlane_b32 s19, v16 -; SI-NEXT: v_readfirstlane_b32 s20, v15 -; SI-NEXT: v_readfirstlane_b32 s21, v11 -; SI-NEXT: v_readfirstlane_b32 s22, v10 -; SI-NEXT: v_readfirstlane_b32 s23, v9 -; SI-NEXT: v_readfirstlane_b32 s24, v8 -; SI-NEXT: v_readfirstlane_b32 s25, v7 -; SI-NEXT: v_readfirstlane_b32 s26, v6 -; SI-NEXT: v_readfirstlane_b32 s27, v5 +; SI-NEXT: v_add_f32_e64 v20, s37, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s36, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s35, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s34, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s31, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s30, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s95, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s94, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s93, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s92, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s91, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s90, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s89, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s88, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s79, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s78, 1.0 +; SI-NEXT: v_readfirstlane_b32 s76, v22 +; SI-NEXT: v_readfirstlane_b32 s77, v20 +; SI-NEXT: v_readfirstlane_b32 s62, v18 +; SI-NEXT: v_readfirstlane_b32 s63, v16 +; SI-NEXT: v_readfirstlane_b32 s44, v15 +; SI-NEXT: v_readfirstlane_b32 s45, v11 +; SI-NEXT: v_readfirstlane_b32 s28, v10 +; SI-NEXT: v_readfirstlane_b32 s29, v9 +; SI-NEXT: v_readfirstlane_b32 s26, v8 +; SI-NEXT: v_readfirstlane_b32 s27, v7 +; SI-NEXT: v_readfirstlane_b32 s20, v6 +; SI-NEXT: v_readfirstlane_b32 s21, v5 ; SI-NEXT: v_readfirstlane_b32 s14, v4 ; SI-NEXT: v_readfirstlane_b32 s15, v3 ; SI-NEXT: v_readfirstlane_b32 s8, v2 @@ -24700,24 +24966,24 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 24 ; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 ; SI-NEXT: s_lshr_b64 s[14:15], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[88:89], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[16:17], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[22:23], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[28:29], 8 +; SI-NEXT: s_lshr_b64 s[28:29], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[40:41], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[62:63], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[76:77], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 8 ; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v1 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v1 @@ -24744,21 +25010,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v20 ; SI-NEXT: s_branch .LBB49_5 ; SI-NEXT: .LBB49_3: -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr51 @@ -24777,15 +25043,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr85 ; SI-NEXT: ; implicit-def: $sgpr84 ; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr10 @@ -24794,22 +25060,22 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB49_2 ; SI-NEXT: .LBB49_4: -; SI-NEXT: v_mov_b32_e32 v22, s16 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_mov_b32_e32 v18, s18 -; SI-NEXT: v_mov_b32_e32 v16, s19 -; SI-NEXT: v_mov_b32_e32 v15, s20 -; SI-NEXT: v_mov_b32_e32 v11, s21 -; SI-NEXT: v_mov_b32_e32 v10, s22 -; SI-NEXT: v_mov_b32_e32 v9, s23 -; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v7, s25 -; SI-NEXT: v_mov_b32_e32 v6, s26 -; SI-NEXT: v_mov_b32_e32 v5, s27 -; SI-NEXT: v_mov_b32_e32 v4, s28 -; SI-NEXT: v_mov_b32_e32 v3, s29 -; SI-NEXT: v_mov_b32_e32 v2, s36 -; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v22, s36 +; SI-NEXT: v_mov_b32_e32 v20, s37 +; SI-NEXT: v_mov_b32_e32 v18, s34 +; SI-NEXT: v_mov_b32_e32 v16, s35 +; SI-NEXT: v_mov_b32_e32 v15, s30 +; SI-NEXT: v_mov_b32_e32 v11, s31 +; SI-NEXT: v_mov_b32_e32 v10, s94 +; SI-NEXT: v_mov_b32_e32 v9, s95 +; SI-NEXT: v_mov_b32_e32 v8, s92 +; SI-NEXT: v_mov_b32_e32 v7, s93 +; SI-NEXT: v_mov_b32_e32 v6, s90 +; SI-NEXT: v_mov_b32_e32 v5, s91 +; SI-NEXT: v_mov_b32_e32 v4, s88 +; SI-NEXT: v_mov_b32_e32 v3, s89 +; SI-NEXT: v_mov_b32_e32 v2, s78 +; SI-NEXT: v_mov_b32_e32 v1, s79 ; SI-NEXT: v_mov_b32_e32 v48, s49 ; SI-NEXT: v_mov_b32_e32 v39, s39 ; SI-NEXT: v_mov_b32_e32 v38, s38 @@ -24836,11 +25102,11 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v12, s82 ; SI-NEXT: .LBB49_5: ; %end ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_lshl_b32 s5, s76, 8 ; SI-NEXT: v_or_b32_e32 v22, s5, v22 -; SI-NEXT: s_and_b32 s5, s30, 0xff +; SI-NEXT: s_and_b32 s5, s74, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s94, 24 +; SI-NEXT: s_lshl_b32 s7, s72, 24 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_or_b32 s5, s7, s5 ; SI-NEXT: v_or_b32_e32 v22, s5, v22 @@ -24851,15 +25117,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v20, v20, v22 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v39 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 -; SI-NEXT: s_lshl_b32 s5, s92, 8 +; SI-NEXT: s_lshl_b32 s5, s62, 8 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v38 ; SI-NEXT: v_or_b32_e32 v18, s5, v18 -; SI-NEXT: s_and_b32 s5, s90, 0xff +; SI-NEXT: s_and_b32 s5, s60, 0xff ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v22, v38, v22 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s88, 24 +; SI-NEXT: s_lshl_b32 s7, s58, 24 ; SI-NEXT: v_or_b32_e32 v20, v20, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 @@ -24875,15 +25141,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v16, v16, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v36 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: s_lshl_b32 s5, s74, 8 +; SI-NEXT: s_lshl_b32 s5, s44, 8 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v35 ; SI-NEXT: v_or_b32_e32 v15, s5, v15 -; SI-NEXT: s_and_b32 s5, s62, 0xff +; SI-NEXT: s_and_b32 s5, s40, 0xff ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s60, 24 +; SI-NEXT: s_lshl_b32 s7, s28, 24 ; SI-NEXT: v_or_b32_e32 v16, v16, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 @@ -24899,15 +25165,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v11, v11, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v33 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: s_lshl_b32 s5, s56, 8 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v32 ; SI-NEXT: v_or_b32_e32 v10, s5, v10 -; SI-NEXT: s_and_b32 s5, s76, 0xff +; SI-NEXT: s_and_b32 s5, s46, 0xff ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s72, 24 +; SI-NEXT: s_lshl_b32 s7, s42, 24 ; SI-NEXT: v_or_b32_e32 v11, v11, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 @@ -24923,15 +25189,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v30 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: s_lshl_b32 s5, s26, 8 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v29 ; SI-NEXT: v_or_b32_e32 v8, s5, v8 -; SI-NEXT: s_and_b32 s5, s56, 0xff +; SI-NEXT: s_and_b32 s5, s24, 0xff ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s46, 24 +; SI-NEXT: s_lshl_b32 s7, s22, 24 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 @@ -24947,15 +25213,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v27 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_lshl_b32 s5, s20, 8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v26 ; SI-NEXT: v_or_b32_e32 v6, s5, v6 -; SI-NEXT: s_and_b32 s5, s42, 0xff +; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s40, 24 +; SI-NEXT: s_lshl_b32 s7, s16, 24 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 36, v0 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 @@ -25086,10 +25352,38 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_writelane_b32 v63, s64, 16 ; VI-NEXT: v_writelane_b32 v63, s65, 17 ; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v17, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_readfirstlane_b32 s18, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s16, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s14, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s12, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s10, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s8, v14 +; VI-NEXT: v_readfirstlane_b32 s9, v15 +; VI-NEXT: v_readfirstlane_b32 s6, v16 +; VI-NEXT: v_readfirstlane_b32 s7, v17 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_and_b64 s[20:21], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -25113,75 +25407,75 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s59, s5, 8 ; VI-NEXT: s_lshr_b32 s58, s4, 16 ; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s72, s29, 8 -; VI-NEXT: s_lshr_b32 s63, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s77, s27, 8 -; VI-NEXT: s_lshr_b32 s76, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s90, s25, 8 -; VI-NEXT: s_lshr_b32 s89, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s35, s23, 8 -; VI-NEXT: s_lshr_b32 s34, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s48, s21, 8 -; VI-NEXT: s_lshr_b32 s39, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s53, s19, 8 -; VI-NEXT: s_lshr_b32 s52, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s66, s17, 8 -; VI-NEXT: s_lshr_b32 s65, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b32 s61, s7, 24 +; VI-NEXT: s_lshr_b32 s62, s7, 16 +; VI-NEXT: s_lshr_b32 s72, s7, 8 +; VI-NEXT: s_lshr_b32 s63, s6, 16 +; VI-NEXT: s_lshr_b32 s73, s6, 8 +; VI-NEXT: s_lshr_b32 s74, s9, 24 +; VI-NEXT: s_lshr_b32 s75, s9, 16 +; VI-NEXT: s_lshr_b32 s77, s9, 8 +; VI-NEXT: s_lshr_b32 s76, s8, 16 +; VI-NEXT: s_lshr_b32 s78, s8, 8 +; VI-NEXT: s_lshr_b32 s79, s11, 24 +; VI-NEXT: s_lshr_b32 s88, s11, 16 +; VI-NEXT: s_lshr_b32 s90, s11, 8 +; VI-NEXT: s_lshr_b32 s89, s10, 16 +; VI-NEXT: s_lshr_b32 s91, s10, 8 +; VI-NEXT: s_lshr_b32 s30, s13, 24 +; VI-NEXT: s_lshr_b32 s31, s13, 16 +; VI-NEXT: s_lshr_b32 s35, s13, 8 +; VI-NEXT: s_lshr_b32 s34, s12, 16 +; VI-NEXT: s_lshr_b32 s36, s12, 8 +; VI-NEXT: s_lshr_b32 s37, s15, 24 +; VI-NEXT: s_lshr_b32 s38, s15, 16 +; VI-NEXT: s_lshr_b32 s48, s15, 8 +; VI-NEXT: s_lshr_b32 s39, s14, 16 +; VI-NEXT: s_lshr_b32 s49, s14, 8 +; VI-NEXT: s_lshr_b32 s50, s17, 24 +; VI-NEXT: s_lshr_b32 s51, s17, 16 +; VI-NEXT: s_lshr_b32 s53, s17, 8 +; VI-NEXT: s_lshr_b32 s52, s16, 16 +; VI-NEXT: s_lshr_b32 s54, s16, 8 +; VI-NEXT: s_lshr_b32 s55, s19, 24 +; VI-NEXT: s_lshr_b32 s64, s19, 16 +; VI-NEXT: s_lshr_b32 s66, s19, 8 +; VI-NEXT: s_lshr_b32 s65, s18, 16 +; VI-NEXT: s_lshr_b32 s67, s18, 8 ; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; VI-NEXT: s_cbranch_execnz .LBB49_4 ; VI-NEXT: .LBB49_2: ; %cmp.true -; VI-NEXT: v_add_f32_e64 v6, s27, 1.0 -; VI-NEXT: v_add_f32_e64 v5, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s9, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s8, 1.0 ; VI-NEXT: v_add_f32_e64 v2, s5, 1.0 ; VI-NEXT: v_add_f32_e64 v1, s4, 1.0 -; VI-NEXT: v_add_f32_e64 v8, s25, 1.0 -; VI-NEXT: v_add_f32_e64 v7, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s11, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s10, 1.0 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[5:6] -; VI-NEXT: v_add_f32_e64 v10, s23, 1.0 -; VI-NEXT: v_add_f32_e64 v9, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s13, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s12, 1.0 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; VI-NEXT: v_add_f32_e64 v12, s21, 1.0 -; VI-NEXT: v_add_f32_e64 v11, s20, 1.0 -; VI-NEXT: v_add_f32_e64 v4, s29, 1.0 -; VI-NEXT: v_add_f32_e64 v3, s28, 1.0 +; VI-NEXT: v_add_f32_e64 v12, s15, 1.0 +; VI-NEXT: v_add_f32_e64 v11, s14, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s7, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s6, 1.0 ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[9:10] -; VI-NEXT: v_add_f32_e64 v16, s19, 1.0 -; VI-NEXT: v_add_f32_e64 v15, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v16, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v15, s16, 1.0 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12] -; VI-NEXT: v_add_f32_e64 v18, s17, 1.0 -; VI-NEXT: v_add_f32_e64 v17, s16, 1.0 +; VI-NEXT: v_add_f32_e64 v18, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v17, s18, 1.0 ; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[15:16] ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill @@ -25230,31 +25524,31 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: .LBB49_3: ; VI-NEXT: ; implicit-def: $sgpr67 ; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr20 ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr64 ; VI-NEXT: ; implicit-def: $sgpr55 ; VI-NEXT: ; implicit-def: $sgpr54 ; VI-NEXT: ; implicit-def: $sgpr52 -; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr22 ; VI-NEXT: ; implicit-def: $sgpr53 ; VI-NEXT: ; implicit-def: $sgpr51 ; VI-NEXT: ; implicit-def: $sgpr50 ; VI-NEXT: ; implicit-def: $sgpr49 ; VI-NEXT: ; implicit-def: $sgpr39 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: ; implicit-def: $sgpr48 ; VI-NEXT: ; implicit-def: $sgpr38 ; VI-NEXT: ; implicit-def: $sgpr37 ; VI-NEXT: ; implicit-def: $sgpr36 ; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr35 ; VI-NEXT: ; implicit-def: $sgpr31 ; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr91 ; VI-NEXT: ; implicit-def: $sgpr89 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr90 ; VI-NEXT: ; implicit-def: $sgpr88 ; VI-NEXT: ; implicit-def: $sgpr79 @@ -25282,20 +25576,20 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v20, s42 -; VI-NEXT: v_mov_b32_e32 v17, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v15, s18 -; VI-NEXT: v_mov_b32_e32 v16, s19 -; VI-NEXT: v_mov_b32_e32 v11, s20 -; VI-NEXT: v_mov_b32_e32 v12, s21 -; VI-NEXT: v_mov_b32_e32 v9, s22 -; VI-NEXT: v_mov_b32_e32 v10, s23 -; VI-NEXT: v_mov_b32_e32 v7, s24 -; VI-NEXT: v_mov_b32_e32 v8, s25 -; VI-NEXT: v_mov_b32_e32 v5, s26 -; VI-NEXT: v_mov_b32_e32 v6, s27 -; VI-NEXT: v_mov_b32_e32 v3, s28 -; VI-NEXT: v_mov_b32_e32 v4, s29 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 +; VI-NEXT: v_mov_b32_e32 v15, s16 +; VI-NEXT: v_mov_b32_e32 v16, s17 +; VI-NEXT: v_mov_b32_e32 v11, s14 +; VI-NEXT: v_mov_b32_e32 v12, s15 +; VI-NEXT: v_mov_b32_e32 v9, s12 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v7, s10 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v5, s8 +; VI-NEXT: v_mov_b32_e32 v6, s9 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_mov_b32_e32 v19, s67 @@ -25338,14 +25632,14 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v27, s59 ; VI-NEXT: v_mov_b32_e32 v14, s57 ; VI-NEXT: v_mov_b32_e32 v26, s56 -; VI-NEXT: v_mov_b32_e32 v22, s12 -; VI-NEXT: v_mov_b32_e32 v23, s10 -; VI-NEXT: v_mov_b32_e32 v24, s8 -; VI-NEXT: v_mov_b32_e32 v25, s6 +; VI-NEXT: v_mov_b32_e32 v22, s26 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v25, s20 ; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v20, s40 -; VI-NEXT: v_mov_b32_e32 v21, s14 +; VI-NEXT: v_mov_b32_e32 v21, s28 ; VI-NEXT: .LBB49_5: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 @@ -25434,21 +25728,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s67, v63, 19 ; VI-NEXT: v_readlane_b32 s66, v63, 18 ; VI-NEXT: v_readlane_b32 s65, v63, 17 @@ -25469,7 +25748,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; VI-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -25499,6 +25778,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] @@ -25526,10 +25820,38 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_writelane_b32 v63, s52, 12 ; GFX9-NEXT: v_writelane_b32 v63, s53, 13 ; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_mov_b32_e32 v8, s20 +; GFX9-NEXT: v_mov_b32_e32 v9, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s22 +; GFX9-NEXT: v_mov_b32_e32 v11, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v14, s26 +; GFX9-NEXT: v_mov_b32_e32 v15, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_readfirstlane_b32 s18, v4 +; GFX9-NEXT: v_readfirstlane_b32 s19, v5 +; GFX9-NEXT: v_readfirstlane_b32 s16, v6 +; GFX9-NEXT: v_readfirstlane_b32 s17, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s12, v10 +; GFX9-NEXT: v_readfirstlane_b32 s13, v11 +; GFX9-NEXT: v_readfirstlane_b32 s10, v12 +; GFX9-NEXT: v_readfirstlane_b32 s11, v13 +; GFX9-NEXT: v_readfirstlane_b32 s8, v14 +; GFX9-NEXT: v_readfirstlane_b32 s9, v15 +; GFX9-NEXT: v_readfirstlane_b32 s6, v16 +; GFX9-NEXT: v_readfirstlane_b32 s7, v17 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -25553,76 +25875,76 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: s_lshr_b32 s59, s5, 8 ; GFX9-NEXT: s_lshr_b32 s58, s4, 16 ; GFX9-NEXT: s_lshr_b32 s60, s4, 8 -; GFX9-NEXT: s_lshr_b32 s61, s29, 24 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s72, s29, 8 -; GFX9-NEXT: s_lshr_b32 s63, s28, 16 -; GFX9-NEXT: s_lshr_b32 s73, s28, 8 -; GFX9-NEXT: s_lshr_b32 s74, s27, 24 -; GFX9-NEXT: s_lshr_b32 s75, s27, 16 -; GFX9-NEXT: s_lshr_b32 s77, s27, 8 -; GFX9-NEXT: s_lshr_b32 s76, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s26, 8 -; GFX9-NEXT: s_lshr_b32 s79, s25, 24 -; GFX9-NEXT: s_lshr_b32 s88, s25, 16 -; GFX9-NEXT: s_lshr_b32 s90, s25, 8 -; GFX9-NEXT: s_lshr_b32 s89, s24, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 8 -; GFX9-NEXT: s_lshr_b32 s92, s23, 24 -; GFX9-NEXT: s_lshr_b32 s93, s23, 16 -; GFX9-NEXT: s_lshr_b32 s95, s23, 8 -; GFX9-NEXT: s_lshr_b32 s94, s22, 16 -; GFX9-NEXT: s_lshr_b32 s30, s22, 8 -; GFX9-NEXT: s_lshr_b32 s31, s21, 24 -; GFX9-NEXT: s_lshr_b32 s34, s21, 16 -; GFX9-NEXT: s_lshr_b32 s36, s21, 8 -; GFX9-NEXT: s_lshr_b32 s35, s20, 16 -; GFX9-NEXT: s_lshr_b32 s37, s20, 8 -; GFX9-NEXT: s_lshr_b32 s38, s19, 24 -; GFX9-NEXT: s_lshr_b32 s39, s19, 16 -; GFX9-NEXT: s_lshr_b32 s49, s19, 8 -; GFX9-NEXT: s_lshr_b32 s48, s18, 16 -; GFX9-NEXT: s_lshr_b32 s50, s18, 8 -; GFX9-NEXT: s_lshr_b32 s51, s17, 24 -; GFX9-NEXT: s_lshr_b32 s52, s17, 16 -; GFX9-NEXT: s_lshr_b32 s54, s17, 8 -; GFX9-NEXT: s_lshr_b32 s53, s16, 16 -; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b32 s61, s7, 24 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 8 +; GFX9-NEXT: s_lshr_b32 s63, s6, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 8 +; GFX9-NEXT: s_lshr_b32 s74, s9, 24 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s77, s9, 8 +; GFX9-NEXT: s_lshr_b32 s76, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s8, 8 +; GFX9-NEXT: s_lshr_b32 s79, s11, 24 +; GFX9-NEXT: s_lshr_b32 s88, s11, 16 +; GFX9-NEXT: s_lshr_b32 s90, s11, 8 +; GFX9-NEXT: s_lshr_b32 s89, s10, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 8 +; GFX9-NEXT: s_lshr_b32 s92, s13, 24 +; GFX9-NEXT: s_lshr_b32 s93, s13, 16 +; GFX9-NEXT: s_lshr_b32 s95, s13, 8 +; GFX9-NEXT: s_lshr_b32 s94, s12, 16 +; GFX9-NEXT: s_lshr_b32 s30, s12, 8 +; GFX9-NEXT: s_lshr_b32 s31, s15, 24 +; GFX9-NEXT: s_lshr_b32 s34, s15, 16 +; GFX9-NEXT: s_lshr_b32 s36, s15, 8 +; GFX9-NEXT: s_lshr_b32 s35, s14, 16 +; GFX9-NEXT: s_lshr_b32 s37, s14, 8 +; GFX9-NEXT: s_lshr_b32 s38, s17, 24 +; GFX9-NEXT: s_lshr_b32 s39, s17, 16 +; GFX9-NEXT: s_lshr_b32 s49, s17, 8 +; GFX9-NEXT: s_lshr_b32 s48, s16, 16 +; GFX9-NEXT: s_lshr_b32 s50, s16, 8 +; GFX9-NEXT: s_lshr_b32 s51, s19, 24 +; GFX9-NEXT: s_lshr_b32 s52, s19, 16 +; GFX9-NEXT: s_lshr_b32 s54, s19, 8 +; GFX9-NEXT: s_lshr_b32 s53, s18, 16 +; GFX9-NEXT: s_lshr_b32 s55, s18, 8 ; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB49_4 ; GFX9-NEXT: .LBB49_2: ; %cmp.true -; GFX9-NEXT: v_add_f32_e64 v6, s27, 1.0 -; GFX9-NEXT: v_add_f32_e64 v5, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s9, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s8, 1.0 ; GFX9-NEXT: v_add_f32_e64 v2, s5, 1.0 ; GFX9-NEXT: v_add_f32_e64 v1, s4, 1.0 -; GFX9-NEXT: v_add_f32_e64 v8, s25, 1.0 -; GFX9-NEXT: v_add_f32_e64 v7, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s11, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s10, 1.0 ; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] ; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] -; GFX9-NEXT: v_add_f32_e64 v10, s23, 1.0 -; GFX9-NEXT: v_add_f32_e64 v9, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s13, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s12, 1.0 ; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8] -; GFX9-NEXT: v_add_f32_e64 v12, s21, 1.0 -; GFX9-NEXT: v_add_f32_e64 v11, s20, 1.0 -; GFX9-NEXT: v_add_f32_e64 v4, s29, 1.0 -; GFX9-NEXT: v_add_f32_e64 v3, s28, 1.0 +; GFX9-NEXT: v_add_f32_e64 v12, s15, 1.0 +; GFX9-NEXT: v_add_f32_e64 v11, s14, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s7, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s6, 1.0 ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] -; GFX9-NEXT: v_add_f32_e64 v16, s19, 1.0 -; GFX9-NEXT: v_add_f32_e64 v15, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v16, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v15, s16, 1.0 ; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] -; GFX9-NEXT: v_add_f32_e64 v20, s17, 1.0 -; GFX9-NEXT: v_add_f32_e64 v19, s16, 1.0 +; GFX9-NEXT: v_add_f32_e64 v20, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v19, s18, 1.0 ; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -25672,31 +25994,31 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: .LBB49_3: ; GFX9-NEXT: ; implicit-def: $sgpr55 ; GFX9-NEXT: ; implicit-def: $sgpr53 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr20 ; GFX9-NEXT: ; implicit-def: $sgpr54 ; GFX9-NEXT: ; implicit-def: $sgpr52 ; GFX9-NEXT: ; implicit-def: $sgpr51 ; GFX9-NEXT: ; implicit-def: $sgpr50 ; GFX9-NEXT: ; implicit-def: $sgpr48 -; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr22 ; GFX9-NEXT: ; implicit-def: $sgpr49 ; GFX9-NEXT: ; implicit-def: $sgpr39 ; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr37 ; GFX9-NEXT: ; implicit-def: $sgpr35 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr31 ; GFX9-NEXT: ; implicit-def: $sgpr30 ; GFX9-NEXT: ; implicit-def: $sgpr94 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr95 ; GFX9-NEXT: ; implicit-def: $sgpr93 ; GFX9-NEXT: ; implicit-def: $sgpr92 ; GFX9-NEXT: ; implicit-def: $sgpr91 ; GFX9-NEXT: ; implicit-def: $sgpr89 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr90 ; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr79 @@ -25725,20 +26047,20 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v21, s42 -; GFX9-NEXT: v_mov_b32_e32 v19, s16 -; GFX9-NEXT: v_mov_b32_e32 v20, s17 -; GFX9-NEXT: v_mov_b32_e32 v15, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: v_mov_b32_e32 v11, s20 -; GFX9-NEXT: v_mov_b32_e32 v12, s21 -; GFX9-NEXT: v_mov_b32_e32 v9, s22 -; GFX9-NEXT: v_mov_b32_e32 v10, s23 -; GFX9-NEXT: v_mov_b32_e32 v7, s24 -; GFX9-NEXT: v_mov_b32_e32 v8, s25 -; GFX9-NEXT: v_mov_b32_e32 v5, s26 -; GFX9-NEXT: v_mov_b32_e32 v6, s27 -; GFX9-NEXT: v_mov_b32_e32 v3, s28 -; GFX9-NEXT: v_mov_b32_e32 v4, s29 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s16 +; GFX9-NEXT: v_mov_b32_e32 v16, s17 +; GFX9-NEXT: v_mov_b32_e32 v11, s14 +; GFX9-NEXT: v_mov_b32_e32 v12, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: v_mov_b32_e32 v10, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v17, s55 @@ -25781,15 +26103,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v27, s59 ; GFX9-NEXT: v_mov_b32_e32 v14, s57 ; GFX9-NEXT: v_mov_b32_e32 v18, s56 -; GFX9-NEXT: v_mov_b32_e32 v23, s12 -; GFX9-NEXT: v_mov_b32_e32 v24, s10 -; GFX9-NEXT: v_mov_b32_e32 v25, s8 -; GFX9-NEXT: v_mov_b32_e32 v26, s6 +; GFX9-NEXT: v_mov_b32_e32 v23, s26 +; GFX9-NEXT: v_mov_b32_e32 v24, s24 +; GFX9-NEXT: v_mov_b32_e32 v25, s22 +; GFX9-NEXT: v_mov_b32_e32 v26, s20 ; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v21, s40 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 ; GFX9-NEXT: .LBB49_5: ; %end ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -25867,21 +26189,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -25898,7 +26205,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -25924,6 +26231,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -31414,112 +31736,110 @@ define inreg <32 x i16> @bitcast_v8i64_to_v32i16_scalar(<8 x i64> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s4, v0 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_mov_b32_e32 v34, s16 +; SI-NEXT: v_mov_b32_e32 v35, s17 +; SI-NEXT: v_mov_b32_e32 v36, s18 +; SI-NEXT: v_mov_b32_e32 v37, s19 +; SI-NEXT: v_mov_b32_e32 v38, s20 +; SI-NEXT: v_mov_b32_e32 v39, s21 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v50, s24 +; SI-NEXT: v_mov_b32_e32 v51, s25 +; SI-NEXT: v_mov_b32_e32 v52, s26 +; SI-NEXT: v_mov_b32_e32 v53, s27 +; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v55, s29 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s56, s5, 16 -; SI-NEXT: s_lshr_b32 s57, s29, 16 -; SI-NEXT: s_lshr_b32 s58, s27, 16 -; SI-NEXT: s_lshr_b32 s59, s25, 16 -; SI-NEXT: s_lshr_b32 s60, s23, 16 -; SI-NEXT: s_lshr_b32 s61, s21, 16 -; SI-NEXT: s_lshr_b32 s62, s19, 16 -; SI-NEXT: s_lshr_b32 s63, s17, 16 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s4, 3 -; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s56, s5, 16 -; SI-NEXT: s_lshr_b32 s57, s29, 16 -; SI-NEXT: s_lshr_b32 s58, s27, 16 -; SI-NEXT: s_lshr_b32 s59, s25, 16 -; SI-NEXT: s_lshr_b32 s60, s23, 16 -; SI-NEXT: s_lshr_b32 s61, s21, 16 -; SI-NEXT: s_lshr_b32 s62, s19, 16 -; SI-NEXT: s_lshr_b32 s63, s17, 16 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v33, vcc, 0, v33, vcc +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_addc_u32_e32 v55, vcc, 0, v55, vcc +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: v_addc_u32_e32 v53, vcc, 0, v53, vcc +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_addc_u32_e32 v51, vcc, 0, v51, vcc +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_addc_u32_e32 v49, vcc, 0, v49, vcc +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_addc_u32_e32 v39, vcc, 0, v39, vcc +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_addc_u32_e32 v37, vcc, 0, v37, vcc +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_addc_u32_e32 v35, vcc, 0, v35, vcc +; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s44 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s63 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s42 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s62 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s40 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s61 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s14 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s60 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s12 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s59 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v21, s10 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v23, s58 -; SI-NEXT: v_mov_b32_e32 v24, s28 -; SI-NEXT: v_mov_b32_e32 v25, s8 -; SI-NEXT: v_mov_b32_e32 v26, s29 -; SI-NEXT: v_mov_b32_e32 v27, s57 -; SI-NEXT: v_mov_b32_e32 v28, s4 -; SI-NEXT: v_mov_b32_e32 v29, s6 -; SI-NEXT: v_mov_b32_e32 v30, s5 -; SI-NEXT: v_mov_b32_e32 v31, s56 +; SI-NEXT: v_mov_b32_e32 v0, v34 +; SI-NEXT: v_mov_b32_e32 v2, v35 +; SI-NEXT: v_mov_b32_e32 v4, v36 +; SI-NEXT: v_mov_b32_e32 v6, v37 +; SI-NEXT: v_mov_b32_e32 v8, v38 +; SI-NEXT: v_mov_b32_e32 v10, v39 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v14, v49 +; SI-NEXT: v_mov_b32_e32 v16, v50 +; SI-NEXT: v_mov_b32_e32 v18, v51 +; SI-NEXT: v_mov_b32_e32 v20, v52 +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v24, v54 +; SI-NEXT: v_mov_b32_e32 v26, v55 +; SI-NEXT: v_mov_b32_e32 v28, v32 +; SI-NEXT: v_mov_b32_e32 v30, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v8i64_to_v32i16_scalar: @@ -32152,111 +32472,139 @@ define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32 ; VI-LABEL: bitcast_v32i16_to_v8i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s25 +; VI-NEXT: v_mov_b32_e32 v13, s26 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v3 +; VI-NEXT: v_readfirstlane_b32 s7, v4 +; VI-NEXT: v_readfirstlane_b32 s8, v5 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s10, v7 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 +; VI-NEXT: v_readfirstlane_b32 s15, v12 +; VI-NEXT: v_readfirstlane_b32 s16, v13 +; VI-NEXT: v_readfirstlane_b32 s17, v14 +; VI-NEXT: v_readfirstlane_b32 s18, v15 +; VI-NEXT: v_readfirstlane_b32 s19, v16 +; VI-NEXT: v_readfirstlane_b32 s20, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v1 ; VI-NEXT: s_cbranch_scc0 .LBB59_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB59_3 ; VI-NEXT: .LBB59_2: ; %cmp.true -; VI-NEXT: s_add_i32 s5, s7, 3 -; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s6, 3 -; VI-NEXT: s_add_i32 s7, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s29, 3 -; VI-NEXT: s_add_i32 s6, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s28, 3 -; VI-NEXT: s_add_i32 s29, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s27, 3 -; VI-NEXT: s_add_i32 s28, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s26, 3 -; VI-NEXT: s_add_i32 s27, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s25, 3 -; VI-NEXT: s_add_i32 s26, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s15, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s24, 3 -; VI-NEXT: s_add_i32 s25, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s14, 3 +; VI-NEXT: s_add_i32 s15, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s14, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s23, 3 -; VI-NEXT: s_add_i32 s24, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s13, 3 +; VI-NEXT: s_add_i32 s14, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s13, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s22, 3 -; VI-NEXT: s_add_i32 s23, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s12, 3 +; VI-NEXT: s_add_i32 s13, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s12, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s21, 3 -; VI-NEXT: s_add_i32 s22, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s11, 3 +; VI-NEXT: s_add_i32 s12, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s11, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s20, 3 -; VI-NEXT: s_add_i32 s21, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s10, 3 +; VI-NEXT: s_add_i32 s11, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s10, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s19, 3 -; VI-NEXT: s_add_i32 s20, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s9, 3 +; VI-NEXT: s_add_i32 s10, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s9, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s18, 3 -; VI-NEXT: s_add_i32 s19, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s8, 3 +; VI-NEXT: s_add_i32 s9, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s8, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s17, 3 -; VI-NEXT: s_add_i32 s18, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_add_i32 s8, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s17, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: .LBB59_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s6 -; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s18 +; VI-NEXT: v_mov_b32_e32 v13, s19 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB59_4: ; VI-NEXT: s_branch .LBB59_2 @@ -32677,108 +33025,136 @@ define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32 ; SI-LABEL: bitcast_v8i64_to_v32f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s21 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s20, v3 +; SI-NEXT: v_readfirstlane_b32 s21, v4 +; SI-NEXT: v_readfirstlane_b32 s18, v5 +; SI-NEXT: v_readfirstlane_b32 s19, v6 +; SI-NEXT: v_readfirstlane_b32 s16, v7 +; SI-NEXT: v_readfirstlane_b32 s17, v8 +; SI-NEXT: v_readfirstlane_b32 s14, v9 +; SI-NEXT: v_readfirstlane_b32 s15, v10 +; SI-NEXT: v_readfirstlane_b32 s12, v11 +; SI-NEXT: v_readfirstlane_b32 s13, v12 +; SI-NEXT: v_readfirstlane_b32 s10, v13 +; SI-NEXT: v_readfirstlane_b32 s11, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v16 ; SI-NEXT: v_readfirstlane_b32 s6, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s9, v1 ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: s_lshr_b32 s4, s9, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s8, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s7, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s4, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s8, s4, 16 -; SI-NEXT: s_lshr_b32 s9, s5, 16 -; SI-NEXT: s_add_u32 s10, s18, 3 -; SI-NEXT: s_addc_u32 s11, s19, 0 -; SI-NEXT: s_lshr_b32 s12, s10, 16 -; SI-NEXT: s_lshr_b32 s13, s11, 16 -; SI-NEXT: s_add_u32 s14, s20, 3 -; SI-NEXT: s_addc_u32 s15, s21, 0 -; SI-NEXT: s_lshr_b32 s16, s14, 16 -; SI-NEXT: s_lshr_b32 s17, s15, 16 -; SI-NEXT: s_add_u32 s18, s22, 3 -; SI-NEXT: s_addc_u32 s19, s23, 0 -; SI-NEXT: s_lshr_b32 s20, s18, 16 -; SI-NEXT: s_lshr_b32 s21, s19, 16 -; SI-NEXT: s_add_u32 s22, s24, 3 -; SI-NEXT: s_addc_u32 s23, s25, 0 -; SI-NEXT: s_lshr_b32 s24, s22, 16 -; SI-NEXT: s_lshr_b32 s25, s23, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_add_u32 s4, s20, 3 +; SI-NEXT: s_addc_u32 s5, s21, 0 +; SI-NEXT: s_lshr_b32 s20, s4, 16 +; SI-NEXT: s_lshr_b32 s21, s5, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s22, s18, 16 +; SI-NEXT: s_lshr_b32 s23, s19, 16 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s24, s16, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 16 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_lshr_b32 s26, s14, 16 +; SI-NEXT: s_lshr_b32 s27, s15, 16 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_lshr_b32 s28, s12, 16 +; SI-NEXT: s_lshr_b32 s29, s13, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 s40, s10, 16 +; SI-NEXT: s_lshr_b32 s41, s11, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s42, s7, 16 +; SI-NEXT: s_lshr_b32 s43, s8, 16 ; SI-NEXT: s_add_u32 s6, s6, 3 -; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_lshr_b32 s44, s6, 16 -; SI-NEXT: s_lshr_b32 s45, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: s_lshr_b32 s45, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 @@ -32787,16 +33163,16 @@ define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s20 ; SI-NEXT: .LBB61_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: @@ -34066,7 +34442,35 @@ define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, ; SI-LABEL: bitcast_v8i64_to_v32bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s21 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s56, v3 +; SI-NEXT: v_readfirstlane_b32 s57, v4 +; SI-NEXT: v_readfirstlane_b32 s58, v5 +; SI-NEXT: v_readfirstlane_b32 s59, v6 +; SI-NEXT: v_readfirstlane_b32 s60, v7 +; SI-NEXT: v_readfirstlane_b32 s61, v8 +; SI-NEXT: v_readfirstlane_b32 s62, v9 +; SI-NEXT: v_readfirstlane_b32 s63, v10 +; SI-NEXT: v_readfirstlane_b32 s72, v11 +; SI-NEXT: v_readfirstlane_b32 s73, v12 +; SI-NEXT: v_readfirstlane_b32 s74, v13 +; SI-NEXT: v_readfirstlane_b32 s75, v14 +; SI-NEXT: v_readfirstlane_b32 s76, v15 +; SI-NEXT: v_readfirstlane_b32 s77, v16 ; SI-NEXT: v_readfirstlane_b32 s78, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s79, v1 @@ -34076,50 +34480,50 @@ define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, ; SI-NEXT: s_lshl_b32 s7, s79, 16 ; SI-NEXT: s_and_b32 s8, s78, 0xffff0000 ; SI-NEXT: s_lshl_b32 s9, s78, 16 -; SI-NEXT: s_and_b32 s10, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_and_b32 s12, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s28, 16 -; SI-NEXT: s_and_b32 s14, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s15, s27, 16 -; SI-NEXT: s_and_b32 s40, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s41, s26, 16 -; SI-NEXT: s_and_b32 s42, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s43, s25, 16 -; SI-NEXT: s_and_b32 s44, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s45, s24, 16 -; SI-NEXT: s_and_b32 s46, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s47, s23, 16 -; SI-NEXT: s_and_b32 s56, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s57, s22, 16 -; SI-NEXT: s_and_b32 s58, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s21, 16 -; SI-NEXT: s_and_b32 s60, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s61, s20, 16 -; SI-NEXT: s_and_b32 s62, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s63, s19, 16 -; SI-NEXT: s_and_b32 s72, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s73, s18, 16 -; SI-NEXT: s_and_b32 s74, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s17, 16 -; SI-NEXT: s_and_b32 s76, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s77, s16, 16 +; SI-NEXT: s_and_b32 s10, s77, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s77, 16 +; SI-NEXT: s_and_b32 s12, s76, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s76, 16 +; SI-NEXT: s_and_b32 s14, s75, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s75, 16 +; SI-NEXT: s_and_b32 s16, s74, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: s_and_b32 s18, s73, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s73, 16 +; SI-NEXT: s_and_b32 s20, s72, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s72, 16 +; SI-NEXT: s_and_b32 s22, s63, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s63, 16 +; SI-NEXT: s_and_b32 s24, s62, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s62, 16 +; SI-NEXT: s_and_b32 s26, s61, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s61, 16 +; SI-NEXT: s_and_b32 s28, s60, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s60, 16 +; SI-NEXT: s_and_b32 s40, s59, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s59, 16 +; SI-NEXT: s_and_b32 s42, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s58, 16 +; SI-NEXT: s_and_b32 s44, s57, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s57, 16 +; SI-NEXT: s_and_b32 s46, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s56, 16 ; SI-NEXT: s_cbranch_execnz .LBB65_3 ; SI-NEXT: .LBB65_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_add_u32 s16, s18, 3 -; SI-NEXT: s_addc_u32 s17, s19, 0 -; SI-NEXT: s_add_u32 s18, s20, 3 -; SI-NEXT: s_addc_u32 s19, s21, 0 -; SI-NEXT: s_add_u32 s20, s22, 3 -; SI-NEXT: s_addc_u32 s21, s23, 0 -; SI-NEXT: s_add_u32 s22, s24, 3 -; SI-NEXT: s_addc_u32 s23, s25, 0 -; SI-NEXT: s_add_u32 s24, s26, 3 -; SI-NEXT: s_addc_u32 s15, s27, 0 -; SI-NEXT: s_add_u32 s13, s28, 3 -; SI-NEXT: s_addc_u32 s11, s29, 0 +; SI-NEXT: s_add_u32 s4, s56, 3 +; SI-NEXT: s_addc_u32 s5, s57, 0 +; SI-NEXT: s_add_u32 s43, s58, 3 +; SI-NEXT: s_addc_u32 s41, s59, 0 +; SI-NEXT: s_add_u32 s29, s60, 3 +; SI-NEXT: s_addc_u32 s27, s61, 0 +; SI-NEXT: s_add_u32 s25, s62, 3 +; SI-NEXT: s_addc_u32 s23, s63, 0 +; SI-NEXT: s_add_u32 s21, s72, 3 +; SI-NEXT: s_addc_u32 s19, s73, 0 +; SI-NEXT: s_add_u32 s17, s74, 3 +; SI-NEXT: s_addc_u32 s15, s75, 0 +; SI-NEXT: s_add_u32 s13, s76, 3 +; SI-NEXT: s_addc_u32 s11, s77, 0 ; SI-NEXT: s_add_u32 s9, s78, 3 ; SI-NEXT: s_addc_u32 s7, s79, 0 ; SI-NEXT: s_and_b32 s6, s7, 0xffff0000 @@ -34132,51 +34536,51 @@ define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, ; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_and_b32 s14, s15, 0xffff0000 ; SI-NEXT: s_lshl_b32 s15, s15, 16 -; SI-NEXT: s_and_b32 s40, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s41, s24, 16 -; SI-NEXT: s_and_b32 s42, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s43, s23, 16 -; SI-NEXT: s_and_b32 s44, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s45, s22, 16 -; SI-NEXT: s_and_b32 s46, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s47, s21, 16 -; SI-NEXT: s_and_b32 s56, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s57, s20, 16 -; SI-NEXT: s_and_b32 s58, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s19, 16 -; SI-NEXT: s_and_b32 s60, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s61, s18, 16 -; SI-NEXT: s_and_b32 s62, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s63, s17, 16 -; SI-NEXT: s_and_b32 s72, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s73, s16, 16 -; SI-NEXT: s_and_b32 s74, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s5, 16 -; SI-NEXT: s_and_b32 s76, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s77, s4, 16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s18, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s20, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s22, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s24, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: s_and_b32 s26, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s27, 16 +; SI-NEXT: s_and_b32 s28, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_and_b32 s40, s41, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s41, 16 +; SI-NEXT: s_and_b32 s42, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s43, 16 +; SI-NEXT: s_and_b32 s44, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s5, 16 +; SI-NEXT: s_and_b32 s46, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s4, 16 ; SI-NEXT: .LBB65_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s77 -; SI-NEXT: v_mov_b32_e32 v1, s76 -; SI-NEXT: v_mov_b32_e32 v2, s75 -; SI-NEXT: v_mov_b32_e32 v3, s74 -; SI-NEXT: v_mov_b32_e32 v4, s73 -; SI-NEXT: v_mov_b32_e32 v5, s72 -; SI-NEXT: v_mov_b32_e32 v6, s63 -; SI-NEXT: v_mov_b32_e32 v7, s62 -; SI-NEXT: v_mov_b32_e32 v8, s61 -; SI-NEXT: v_mov_b32_e32 v9, s60 -; SI-NEXT: v_mov_b32_e32 v10, s59 -; SI-NEXT: v_mov_b32_e32 v11, s58 -; SI-NEXT: v_mov_b32_e32 v12, s57 -; SI-NEXT: v_mov_b32_e32 v13, s56 -; SI-NEXT: v_mov_b32_e32 v14, s47 -; SI-NEXT: v_mov_b32_e32 v15, s46 -; SI-NEXT: v_mov_b32_e32 v16, s45 -; SI-NEXT: v_mov_b32_e32 v17, s44 -; SI-NEXT: v_mov_b32_e32 v18, s43 -; SI-NEXT: v_mov_b32_e32 v19, s42 -; SI-NEXT: v_mov_b32_e32 v20, s41 -; SI-NEXT: v_mov_b32_e32 v21, s40 +; SI-NEXT: v_mov_b32_e32 v0, s47 +; SI-NEXT: v_mov_b32_e32 v1, s46 +; SI-NEXT: v_mov_b32_e32 v2, s45 +; SI-NEXT: v_mov_b32_e32 v3, s44 +; SI-NEXT: v_mov_b32_e32 v4, s43 +; SI-NEXT: v_mov_b32_e32 v5, s42 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s40 +; SI-NEXT: v_mov_b32_e32 v8, s29 +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s27 +; SI-NEXT: v_mov_b32_e32 v11, s26 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s22 +; SI-NEXT: v_mov_b32_e32 v16, s21 +; SI-NEXT: v_mov_b32_e32 v17, s20 +; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_mov_b32_e32 v21, s16 ; SI-NEXT: v_mov_b32_e32 v22, s15 ; SI-NEXT: v_mov_b32_e32 v23, s14 ; SI-NEXT: v_mov_b32_e32 v24, s13 @@ -34189,20 +34593,6 @@ define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v31, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB65_4: -; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr45 @@ -34211,6 +34601,20 @@ define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr13 @@ -35773,172 +36177,209 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshr_b64 v[0:1], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v41 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v59 +; SI-NEXT: v_lshr_b64 v[5:6], v[37:38], 16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; SI-NEXT: v_lshr_b64 v[6:7], v[35:36], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v57 +; SI-NEXT: v_lshr_b64 v[7:8], v[33:34], 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v56 +; SI-NEXT: v_lshr_b64 v[8:9], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v47 +; SI-NEXT: v_lshr_b64 v[9:10], v[29:30], 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v46 +; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v45 +; SI-NEXT: v_lshr_b64 v[11:12], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v44 +; SI-NEXT: v_lshr_b64 v[12:13], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshr_b64 v[13:14], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshr_b64 v[14:15], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v20, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16 +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v56 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v47 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v42 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 ; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -35947,687 +36388,665 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-LABEL: bitcast_v32bf16_to_v8i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 -; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: v_mov_b32_e32 v10, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s31, v1 -; VI-NEXT: s_cbranch_scc0 .LBB67_3 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_cbranch_scc0 .LBB67_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB67_4 +; VI-NEXT: s_cbranch_execnz .LBB67_3 ; VI-NEXT: .LBB67_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[17:18] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[18:19] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[19:20] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[20:21] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[21:22] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_lshrrev_b64 v[21:22], 16, v[21:22] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[22:23] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[23:24] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v24, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; VI-NEXT: s_branch .LBB67_5 -; VI-NEXT: .LBB67_3: -; VI-NEXT: s_branch .LBB67_2 -; VI-NEXT: .LBB67_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s30 -; VI-NEXT: v_mov_b32_e32 v15, s31 -; VI-NEXT: .LBB67_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[24:25] +; VI-NEXT: v_mov_b32_e32 v1, v23 +; VI-NEXT: v_mov_b32_e32 v3, v22 +; VI-NEXT: v_mov_b32_e32 v5, v21 +; VI-NEXT: v_mov_b32_e32 v7, v20 +; VI-NEXT: v_mov_b32_e32 v9, v19 +; VI-NEXT: v_mov_b32_e32 v11, v18 +; VI-NEXT: v_mov_b32_e32 v13, v17 +; VI-NEXT: v_mov_b32_e32 v15, v16 +; VI-NEXT: .LBB67_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB67_4: +; VI-NEXT: s_branch .LBB67_2 ; ; GFX9-LABEL: bitcast_v32bf16_to_v8i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 -; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s31, v1 -; GFX9-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB67_4 +; GFX9-NEXT: s_cbranch_execnz .LBB67_3 ; GFX9-NEXT: .LBB67_2: ; %cmp.true -; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s4, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v16 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v16, v16, v15 +; GFX9-NEXT: v_add_u32_e32 v16, 0x7fff, v16 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc ; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_and_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v17, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v14 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v14, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v17, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v13 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v17, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v12 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v17, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v11 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v17, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v10 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v17, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v9 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v17, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v8 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v17, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v7 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v17, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v6 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v5 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v17, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v4 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v17, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v3 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v1 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc ; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 -; GFX9-NEXT: s_branch .LBB67_5 -; GFX9-NEXT: .LBB67_3: -; GFX9-NEXT: s_branch .LBB67_2 -; GFX9-NEXT: .LBB67_4: -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 -; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: v_mov_b32_e32 v14, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s31 -; GFX9-NEXT: .LBB67_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v17, 16, v0 +; GFX9-NEXT: .LBB67_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB67_4: +; GFX9-NEXT: s_branch .LBB67_2 ; ; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v8i64_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -37702,22 +38121,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -37753,6 +38156,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -38035,22 +38454,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -38084,6 +38487,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -38355,25 +38774,9 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -38400,6 +38803,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -38992,345 +39411,373 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v4, s30, 0 -; SI-NEXT: v_writelane_b32 v4, s31, 1 -; SI-NEXT: v_writelane_b32 v4, s34, 2 -; SI-NEXT: v_writelane_b32 v4, s35, 3 -; SI-NEXT: v_writelane_b32 v4, s36, 4 -; SI-NEXT: v_writelane_b32 v4, s37, 5 -; SI-NEXT: v_writelane_b32 v4, s38, 6 -; SI-NEXT: v_writelane_b32 v4, s39, 7 -; SI-NEXT: v_writelane_b32 v4, s48, 8 -; SI-NEXT: v_writelane_b32 v4, s49, 9 -; SI-NEXT: v_writelane_b32 v4, s50, 10 -; SI-NEXT: v_writelane_b32 v4, s51, 11 -; SI-NEXT: v_writelane_b32 v4, s52, 12 -; SI-NEXT: v_writelane_b32 v4, s53, 13 -; SI-NEXT: v_writelane_b32 v4, s54, 14 -; SI-NEXT: v_writelane_b32 v4, s55, 15 -; SI-NEXT: v_writelane_b32 v4, s64, 16 -; SI-NEXT: v_writelane_b32 v4, s65, 17 -; SI-NEXT: v_writelane_b32 v4, s66, 18 -; SI-NEXT: v_writelane_b32 v4, s67, 19 -; SI-NEXT: v_writelane_b32 v4, s68, 20 -; SI-NEXT: v_writelane_b32 v4, s69, 21 -; SI-NEXT: v_writelane_b32 v4, s70, 22 -; SI-NEXT: v_writelane_b32 v4, s71, 23 -; SI-NEXT: v_writelane_b32 v4, s80, 24 -; SI-NEXT: v_writelane_b32 v4, s81, 25 -; SI-NEXT: v_writelane_b32 v4, s82, 26 -; SI-NEXT: v_writelane_b32 v4, s83, 27 +; SI-NEXT: v_writelane_b32 v18, s30, 0 +; SI-NEXT: v_writelane_b32 v18, s31, 1 +; SI-NEXT: v_writelane_b32 v18, s34, 2 +; SI-NEXT: v_writelane_b32 v18, s35, 3 +; SI-NEXT: v_writelane_b32 v18, s36, 4 +; SI-NEXT: v_writelane_b32 v18, s37, 5 +; SI-NEXT: v_writelane_b32 v18, s38, 6 +; SI-NEXT: v_writelane_b32 v18, s39, 7 +; SI-NEXT: v_writelane_b32 v18, s48, 8 +; SI-NEXT: v_writelane_b32 v18, s49, 9 +; SI-NEXT: v_writelane_b32 v18, s50, 10 +; SI-NEXT: v_writelane_b32 v18, s51, 11 +; SI-NEXT: v_writelane_b32 v18, s52, 12 +; SI-NEXT: v_writelane_b32 v18, s53, 13 +; SI-NEXT: v_writelane_b32 v18, s54, 14 +; SI-NEXT: v_writelane_b32 v18, s55, 15 +; SI-NEXT: v_writelane_b32 v18, s64, 16 +; SI-NEXT: v_writelane_b32 v18, s65, 17 +; SI-NEXT: v_writelane_b32 v18, s66, 18 +; SI-NEXT: v_writelane_b32 v18, s67, 19 +; SI-NEXT: v_writelane_b32 v18, s68, 20 +; SI-NEXT: v_writelane_b32 v18, s69, 21 +; SI-NEXT: v_writelane_b32 v18, s70, 22 +; SI-NEXT: v_writelane_b32 v18, s71, 23 +; SI-NEXT: v_writelane_b32 v18, s80, 24 +; SI-NEXT: v_writelane_b32 v18, s81, 25 +; SI-NEXT: v_writelane_b32 v18, s82, 26 +; SI-NEXT: v_writelane_b32 v18, s83, 27 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_writelane_b32 v4, s84, 28 +; SI-NEXT: v_writelane_b32 v18, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s18, v4 +; SI-NEXT: v_readfirstlane_b32 s19, v5 +; SI-NEXT: v_readfirstlane_b32 s16, v6 +; SI-NEXT: v_readfirstlane_b32 s17, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v17 ; SI-NEXT: v_readfirstlane_b32 s4, v1 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: s_and_b64 s[20:21], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v2 -; SI-NEXT: v_writelane_b32 v4, s85, 29 +; SI-NEXT: v_writelane_b32 v18, s85, 29 ; SI-NEXT: s_cbranch_scc0 .LBB69_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s38, s5, 24 ; SI-NEXT: s_lshr_b32 s39, s5, 16 ; SI-NEXT: s_lshr_b32 s48, s5, 8 -; SI-NEXT: s_lshr_b32 s49, s29, 24 -; SI-NEXT: s_lshr_b32 s50, s29, 16 -; SI-NEXT: s_lshr_b32 s51, s29, 8 -; SI-NEXT: s_lshr_b32 s52, s27, 24 -; SI-NEXT: s_lshr_b32 s53, s27, 16 -; SI-NEXT: s_lshr_b32 s54, s27, 8 -; SI-NEXT: s_lshr_b32 s55, s25, 24 -; SI-NEXT: s_lshr_b32 s64, s25, 16 -; SI-NEXT: s_lshr_b32 s65, s25, 8 -; SI-NEXT: s_lshr_b32 s66, s23, 24 -; SI-NEXT: s_lshr_b32 s67, s23, 16 -; SI-NEXT: s_lshr_b32 s68, s23, 8 -; SI-NEXT: s_lshr_b32 s69, s21, 24 -; SI-NEXT: s_lshr_b32 s70, s21, 16 -; SI-NEXT: s_lshr_b32 s71, s21, 8 -; SI-NEXT: s_lshr_b32 s80, s19, 24 -; SI-NEXT: s_lshr_b32 s81, s19, 16 -; SI-NEXT: s_lshr_b32 s82, s19, 8 -; SI-NEXT: s_lshr_b32 s83, s17, 24 -; SI-NEXT: s_lshr_b32 s84, s17, 16 -; SI-NEXT: s_lshr_b32 s85, s17, 8 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 -; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s49, s7, 24 +; SI-NEXT: s_lshr_b32 s50, s7, 16 +; SI-NEXT: s_lshr_b32 s51, s7, 8 +; SI-NEXT: s_lshr_b32 s52, s9, 24 +; SI-NEXT: s_lshr_b32 s53, s9, 16 +; SI-NEXT: s_lshr_b32 s54, s9, 8 +; SI-NEXT: s_lshr_b32 s55, s11, 24 +; SI-NEXT: s_lshr_b32 s64, s11, 16 +; SI-NEXT: s_lshr_b32 s65, s11, 8 +; SI-NEXT: s_lshr_b32 s66, s13, 24 +; SI-NEXT: s_lshr_b32 s67, s13, 16 +; SI-NEXT: s_lshr_b32 s68, s13, 8 +; SI-NEXT: s_lshr_b32 s69, s15, 24 +; SI-NEXT: s_lshr_b32 s70, s15, 16 +; SI-NEXT: s_lshr_b32 s71, s15, 8 +; SI-NEXT: s_lshr_b32 s80, s17, 24 +; SI-NEXT: s_lshr_b32 s81, s17, 16 +; SI-NEXT: s_lshr_b32 s82, s17, 8 +; SI-NEXT: s_lshr_b32 s83, s19, 24 +; SI-NEXT: s_lshr_b32 s84, s19, 16 +; SI-NEXT: s_lshr_b32 s85, s19, 8 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[18:19], 8 ; SI-NEXT: s_cbranch_execnz .LBB69_3 ; SI-NEXT: .LBB69_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s4, 3 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 ; SI-NEXT: s_add_u32 s16, s16, 3 ; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 ; SI-NEXT: s_lshr_b32 s38, s5, 24 ; SI-NEXT: s_lshr_b32 s39, s5, 16 ; SI-NEXT: s_lshr_b32 s48, s5, 8 -; SI-NEXT: s_lshr_b32 s49, s29, 24 -; SI-NEXT: s_lshr_b32 s50, s29, 16 -; SI-NEXT: s_lshr_b32 s51, s29, 8 -; SI-NEXT: s_lshr_b32 s52, s27, 24 -; SI-NEXT: s_lshr_b32 s53, s27, 16 -; SI-NEXT: s_lshr_b32 s54, s27, 8 -; SI-NEXT: s_lshr_b32 s55, s25, 24 -; SI-NEXT: s_lshr_b32 s64, s25, 16 -; SI-NEXT: s_lshr_b32 s65, s25, 8 -; SI-NEXT: s_lshr_b32 s66, s23, 24 -; SI-NEXT: s_lshr_b32 s67, s23, 16 -; SI-NEXT: s_lshr_b32 s68, s23, 8 -; SI-NEXT: s_lshr_b32 s69, s21, 24 -; SI-NEXT: s_lshr_b32 s70, s21, 16 -; SI-NEXT: s_lshr_b32 s71, s21, 8 -; SI-NEXT: s_lshr_b32 s80, s19, 24 -; SI-NEXT: s_lshr_b32 s81, s19, 16 -; SI-NEXT: s_lshr_b32 s82, s19, 8 -; SI-NEXT: s_lshr_b32 s83, s17, 24 -; SI-NEXT: s_lshr_b32 s84, s17, 16 -; SI-NEXT: s_lshr_b32 s85, s17, 8 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 -; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s49, s7, 24 +; SI-NEXT: s_lshr_b32 s50, s7, 16 +; SI-NEXT: s_lshr_b32 s51, s7, 8 +; SI-NEXT: s_lshr_b32 s52, s9, 24 +; SI-NEXT: s_lshr_b32 s53, s9, 16 +; SI-NEXT: s_lshr_b32 s54, s9, 8 +; SI-NEXT: s_lshr_b32 s55, s11, 24 +; SI-NEXT: s_lshr_b32 s64, s11, 16 +; SI-NEXT: s_lshr_b32 s65, s11, 8 +; SI-NEXT: s_lshr_b32 s66, s13, 24 +; SI-NEXT: s_lshr_b32 s67, s13, 16 +; SI-NEXT: s_lshr_b32 s68, s13, 8 +; SI-NEXT: s_lshr_b32 s69, s15, 24 +; SI-NEXT: s_lshr_b32 s70, s15, 16 +; SI-NEXT: s_lshr_b32 s71, s15, 8 +; SI-NEXT: s_lshr_b32 s80, s17, 24 +; SI-NEXT: s_lshr_b32 s81, s17, 16 +; SI-NEXT: s_lshr_b32 s82, s17, 8 +; SI-NEXT: s_lshr_b32 s83, s19, 24 +; SI-NEXT: s_lshr_b32 s84, s19, 16 +; SI-NEXT: s_lshr_b32 s85, s19, 8 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[18:19], 8 ; SI-NEXT: .LBB69_3: ; %end -; SI-NEXT: s_lshl_b32 s7, s36, 8 -; SI-NEXT: s_and_b32 s9, s16, 0xff -; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: s_and_b32 s9, s34, 0xff -; SI-NEXT: s_lshl_b32 s11, s30, 24 -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_and_b32 s7, s17, 0xff -; SI-NEXT: s_lshl_b32 s9, s85, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s84, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s83, 24 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_lshl_b32 s7, s94, 8 -; SI-NEXT: s_and_b32 s9, s18, 0xff -; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: s_and_b32 s9, s92, 0xff -; SI-NEXT: s_lshl_b32 s11, s90, 24 -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: s_and_b32 s7, s19, 0xff -; SI-NEXT: s_lshl_b32 s9, s82, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s81, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s80, 24 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s21, s36, 8 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_or_b32 s18, s18, s21 +; SI-NEXT: s_and_b32 s21, s34, 0xff +; SI-NEXT: s_lshl_b32 s23, s30, 24 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_or_b32 s21, s23, s21 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s21 +; SI-NEXT: v_mov_b32_e32 v1, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xff +; SI-NEXT: s_lshl_b32 s19, s85, 8 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s84, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s21, s83, 24 +; SI-NEXT: s_or_b32 s19, s21, s19 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_lshl_b32 s18, s94, 8 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: s_and_b32 s18, s92, 0xff +; SI-NEXT: s_lshl_b32 s19, s90, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xff +; SI-NEXT: s_lshl_b32 s17, s82, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s81, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s80, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s20, 0xff -; SI-NEXT: s_lshl_b32 s9, s76, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s72, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s62, 24 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s16, s76, 8 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: s_and_b32 s16, s72, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, s62, 24 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s21, 0xff -; SI-NEXT: s_lshl_b32 s9, s71, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s70, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s69, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xff +; SI-NEXT: s_lshl_b32 s15, s71, 8 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s70, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_lshl_b32 s16, s69, 24 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s15, s16, s15 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: s_lshl_b32 s9, s88, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s78, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s74, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s14, s88, 8 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_and_b32 s14, s78, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s74, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s23, 0xff -; SI-NEXT: s_lshl_b32 s9, s68, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s67, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s66, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xff +; SI-NEXT: s_lshl_b32 s13, s68, 8 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s67, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s14, s66, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s24, 0xff -; SI-NEXT: s_lshl_b32 s9, s60, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s58, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s56, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s12, s60, 8 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s12, s58, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s13, s56, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s25, 0xff -; SI-NEXT: s_lshl_b32 s9, s65, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s64, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s55, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xff +; SI-NEXT: s_lshl_b32 s11, s65, 8 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s64, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s12, s55, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s26, 0xff -; SI-NEXT: s_lshl_b32 s9, s46, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s44, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s10, s46, 8 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_and_b32 s10, s44, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_lshl_b32 s11, s42, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s27, 0xff +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xff ; SI-NEXT: s_lshl_b32 s9, s54, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s53, 0xff ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s52, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_lshl_b32 s10, s52, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s28, 0xff -; SI-NEXT: s_lshl_b32 s9, s40, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s14, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s12, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s8, s40, 8 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s26, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s29, 0xff -; SI-NEXT: s_lshl_b32 s9, s51, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s50, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s49, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: s_lshl_b32 s7, s51, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s50, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s49, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s7, s10, 8 -; SI-NEXT: s_or_b32 s4, s4, s7 -; SI-NEXT: s_and_b32 s7, s8, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_lshl_b32 s6, s24, 8 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s20, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -39350,38 +39797,38 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s85, v4, 29 -; SI-NEXT: v_readlane_b32 s84, v4, 28 -; SI-NEXT: v_readlane_b32 s83, v4, 27 -; SI-NEXT: v_readlane_b32 s82, v4, 26 -; SI-NEXT: v_readlane_b32 s81, v4, 25 -; SI-NEXT: v_readlane_b32 s80, v4, 24 -; SI-NEXT: v_readlane_b32 s71, v4, 23 -; SI-NEXT: v_readlane_b32 s70, v4, 22 -; SI-NEXT: v_readlane_b32 s69, v4, 21 -; SI-NEXT: v_readlane_b32 s68, v4, 20 -; SI-NEXT: v_readlane_b32 s67, v4, 19 -; SI-NEXT: v_readlane_b32 s66, v4, 18 -; SI-NEXT: v_readlane_b32 s65, v4, 17 -; SI-NEXT: v_readlane_b32 s64, v4, 16 -; SI-NEXT: v_readlane_b32 s55, v4, 15 -; SI-NEXT: v_readlane_b32 s54, v4, 14 -; SI-NEXT: v_readlane_b32 s53, v4, 13 -; SI-NEXT: v_readlane_b32 s52, v4, 12 -; SI-NEXT: v_readlane_b32 s51, v4, 11 -; SI-NEXT: v_readlane_b32 s50, v4, 10 -; SI-NEXT: v_readlane_b32 s49, v4, 9 -; SI-NEXT: v_readlane_b32 s48, v4, 8 -; SI-NEXT: v_readlane_b32 s39, v4, 7 -; SI-NEXT: v_readlane_b32 s38, v4, 6 -; SI-NEXT: v_readlane_b32 s37, v4, 5 -; SI-NEXT: v_readlane_b32 s36, v4, 4 -; SI-NEXT: v_readlane_b32 s35, v4, 3 -; SI-NEXT: v_readlane_b32 s34, v4, 2 -; SI-NEXT: v_readlane_b32 s31, v4, 1 -; SI-NEXT: v_readlane_b32 s30, v4, 0 +; SI-NEXT: v_readlane_b32 s85, v18, 29 +; SI-NEXT: v_readlane_b32 s84, v18, 28 +; SI-NEXT: v_readlane_b32 s83, v18, 27 +; SI-NEXT: v_readlane_b32 s82, v18, 26 +; SI-NEXT: v_readlane_b32 s81, v18, 25 +; SI-NEXT: v_readlane_b32 s80, v18, 24 +; SI-NEXT: v_readlane_b32 s71, v18, 23 +; SI-NEXT: v_readlane_b32 s70, v18, 22 +; SI-NEXT: v_readlane_b32 s69, v18, 21 +; SI-NEXT: v_readlane_b32 s68, v18, 20 +; SI-NEXT: v_readlane_b32 s67, v18, 19 +; SI-NEXT: v_readlane_b32 s66, v18, 18 +; SI-NEXT: v_readlane_b32 s65, v18, 17 +; SI-NEXT: v_readlane_b32 s64, v18, 16 +; SI-NEXT: v_readlane_b32 s55, v18, 15 +; SI-NEXT: v_readlane_b32 s54, v18, 14 +; SI-NEXT: v_readlane_b32 s53, v18, 13 +; SI-NEXT: v_readlane_b32 s52, v18, 12 +; SI-NEXT: v_readlane_b32 s51, v18, 11 +; SI-NEXT: v_readlane_b32 s50, v18, 10 +; SI-NEXT: v_readlane_b32 s49, v18, 9 +; SI-NEXT: v_readlane_b32 s48, v18, 8 +; SI-NEXT: v_readlane_b32 s39, v18, 7 +; SI-NEXT: v_readlane_b32 s38, v18, 6 +; SI-NEXT: v_readlane_b32 s37, v18, 5 +; SI-NEXT: v_readlane_b32 s36, v18, 4 +; SI-NEXT: v_readlane_b32 s35, v18, 3 +; SI-NEXT: v_readlane_b32 s34, v18, 2 +; SI-NEXT: v_readlane_b32 s31, v18, 1 +; SI-NEXT: v_readlane_b32 s30, v18, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -39429,43 +39876,71 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr20 ; SI-NEXT: s_branch .LBB69_2 ; ; VI-LABEL: bitcast_v8i64_to_v64i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v4, s30, 0 -; VI-NEXT: v_writelane_b32 v4, s31, 1 -; VI-NEXT: v_writelane_b32 v4, s34, 2 -; VI-NEXT: v_writelane_b32 v4, s35, 3 -; VI-NEXT: v_writelane_b32 v4, s36, 4 -; VI-NEXT: v_writelane_b32 v4, s37, 5 -; VI-NEXT: v_writelane_b32 v4, s38, 6 -; VI-NEXT: v_writelane_b32 v4, s39, 7 -; VI-NEXT: v_writelane_b32 v4, s48, 8 -; VI-NEXT: v_writelane_b32 v4, s49, 9 -; VI-NEXT: v_writelane_b32 v4, s50, 10 -; VI-NEXT: v_writelane_b32 v4, s51, 11 -; VI-NEXT: v_writelane_b32 v4, s52, 12 -; VI-NEXT: v_writelane_b32 v4, s53, 13 -; VI-NEXT: v_writelane_b32 v4, s54, 14 -; VI-NEXT: v_writelane_b32 v4, s55, 15 -; VI-NEXT: v_writelane_b32 v4, s64, 16 -; VI-NEXT: v_writelane_b32 v4, s65, 17 +; VI-NEXT: v_writelane_b32 v18, s30, 0 +; VI-NEXT: v_writelane_b32 v18, s31, 1 +; VI-NEXT: v_writelane_b32 v18, s34, 2 +; VI-NEXT: v_writelane_b32 v18, s35, 3 +; VI-NEXT: v_writelane_b32 v18, s36, 4 +; VI-NEXT: v_writelane_b32 v18, s37, 5 +; VI-NEXT: v_writelane_b32 v18, s38, 6 +; VI-NEXT: v_writelane_b32 v18, s39, 7 +; VI-NEXT: v_writelane_b32 v18, s48, 8 +; VI-NEXT: v_writelane_b32 v18, s49, 9 +; VI-NEXT: v_writelane_b32 v18, s50, 10 +; VI-NEXT: v_writelane_b32 v18, s51, 11 +; VI-NEXT: v_writelane_b32 v18, s52, 12 +; VI-NEXT: v_writelane_b32 v18, s53, 13 +; VI-NEXT: v_writelane_b32 v18, s54, 14 +; VI-NEXT: v_writelane_b32 v18, s55, 15 +; VI-NEXT: v_writelane_b32 v18, s64, 16 +; VI-NEXT: v_writelane_b32 v18, s65, 17 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v17, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: v_writelane_b32 v4, s66, 18 +; VI-NEXT: v_writelane_b32 v18, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s18, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s16, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s14, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s12, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s10, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s8, v14 +; VI-NEXT: v_readfirstlane_b32 s9, v15 +; VI-NEXT: v_readfirstlane_b32 s6, v16 +; VI-NEXT: v_readfirstlane_b32 s7, v17 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_and_b64 s[20:21], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: v_writelane_b32 v4, s67, 19 +; VI-NEXT: v_writelane_b32 v18, s67, 19 ; VI-NEXT: s_cbranch_scc0 .LBB69_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b32 s56, s5, 24 @@ -39473,287 +39948,287 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; VI-NEXT: s_lshr_b32 s58, s5, 8 ; VI-NEXT: s_lshr_b32 s59, s4, 16 ; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s29, 8 -; VI-NEXT: s_lshr_b32 s72, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 8 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s89, s25, 8 -; VI-NEXT: s_lshr_b32 s90, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s34, s23, 8 -; VI-NEXT: s_lshr_b32 s35, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s39, s21, 8 -; VI-NEXT: s_lshr_b32 s48, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s52, s19, 8 -; VI-NEXT: s_lshr_b32 s53, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s65, s17, 8 -; VI-NEXT: s_lshr_b32 s66, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 -; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s61, s7, 24 +; VI-NEXT: s_lshr_b32 s62, s7, 16 +; VI-NEXT: s_lshr_b32 s63, s7, 8 +; VI-NEXT: s_lshr_b32 s72, s6, 16 +; VI-NEXT: s_lshr_b32 s73, s6, 8 +; VI-NEXT: s_lshr_b32 s74, s9, 24 +; VI-NEXT: s_lshr_b32 s75, s9, 16 +; VI-NEXT: s_lshr_b32 s76, s9, 8 +; VI-NEXT: s_lshr_b32 s77, s8, 16 +; VI-NEXT: s_lshr_b32 s78, s8, 8 +; VI-NEXT: s_lshr_b32 s79, s11, 24 +; VI-NEXT: s_lshr_b32 s88, s11, 16 +; VI-NEXT: s_lshr_b32 s89, s11, 8 +; VI-NEXT: s_lshr_b32 s90, s10, 16 +; VI-NEXT: s_lshr_b32 s91, s10, 8 +; VI-NEXT: s_lshr_b32 s30, s13, 24 +; VI-NEXT: s_lshr_b32 s31, s13, 16 +; VI-NEXT: s_lshr_b32 s34, s13, 8 +; VI-NEXT: s_lshr_b32 s35, s12, 16 +; VI-NEXT: s_lshr_b32 s36, s12, 8 +; VI-NEXT: s_lshr_b32 s37, s15, 24 +; VI-NEXT: s_lshr_b32 s38, s15, 16 +; VI-NEXT: s_lshr_b32 s39, s15, 8 +; VI-NEXT: s_lshr_b32 s48, s14, 16 +; VI-NEXT: s_lshr_b32 s49, s14, 8 +; VI-NEXT: s_lshr_b32 s50, s17, 24 +; VI-NEXT: s_lshr_b32 s51, s17, 16 +; VI-NEXT: s_lshr_b32 s52, s17, 8 +; VI-NEXT: s_lshr_b32 s53, s16, 16 +; VI-NEXT: s_lshr_b32 s54, s16, 8 +; VI-NEXT: s_lshr_b32 s55, s19, 24 +; VI-NEXT: s_lshr_b32 s64, s19, 16 +; VI-NEXT: s_lshr_b32 s65, s19, 8 +; VI-NEXT: s_lshr_b32 s66, s18, 16 +; VI-NEXT: s_lshr_b32 s67, s18, 8 +; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; VI-NEXT: s_cbranch_execnz .LBB69_3 ; VI-NEXT: .LBB69_2: ; %cmp.true -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 ; VI-NEXT: s_add_u32 s18, s18, 3 ; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s14, s14, 3 +; VI-NEXT: s_addc_u32 s15, s15, 0 +; VI-NEXT: s_add_u32 s12, s12, 3 +; VI-NEXT: s_addc_u32 s13, s13, 0 +; VI-NEXT: s_add_u32 s10, s10, 3 +; VI-NEXT: s_addc_u32 s11, s11, 0 +; VI-NEXT: s_add_u32 s8, s8, 3 +; VI-NEXT: s_addc_u32 s9, s9, 0 +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 ; VI-NEXT: s_add_u32 s4, s4, 3 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; VI-NEXT: s_lshr_b32 s56, s5, 24 ; VI-NEXT: s_lshr_b32 s57, s5, 16 ; VI-NEXT: s_lshr_b32 s58, s5, 8 ; VI-NEXT: s_lshr_b32 s59, s4, 16 ; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s29, 8 -; VI-NEXT: s_lshr_b32 s72, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 8 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s89, s25, 8 -; VI-NEXT: s_lshr_b32 s90, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s34, s23, 8 -; VI-NEXT: s_lshr_b32 s35, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s39, s21, 8 -; VI-NEXT: s_lshr_b32 s48, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s52, s19, 8 -; VI-NEXT: s_lshr_b32 s53, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s65, s17, 8 -; VI-NEXT: s_lshr_b32 s66, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b32 s61, s7, 24 +; VI-NEXT: s_lshr_b32 s62, s7, 16 +; VI-NEXT: s_lshr_b32 s63, s7, 8 +; VI-NEXT: s_lshr_b32 s72, s6, 16 +; VI-NEXT: s_lshr_b32 s73, s6, 8 +; VI-NEXT: s_lshr_b32 s74, s9, 24 +; VI-NEXT: s_lshr_b32 s75, s9, 16 +; VI-NEXT: s_lshr_b32 s76, s9, 8 +; VI-NEXT: s_lshr_b32 s77, s8, 16 +; VI-NEXT: s_lshr_b32 s78, s8, 8 +; VI-NEXT: s_lshr_b32 s79, s11, 24 +; VI-NEXT: s_lshr_b32 s88, s11, 16 +; VI-NEXT: s_lshr_b32 s89, s11, 8 +; VI-NEXT: s_lshr_b32 s90, s10, 16 +; VI-NEXT: s_lshr_b32 s91, s10, 8 +; VI-NEXT: s_lshr_b32 s30, s13, 24 +; VI-NEXT: s_lshr_b32 s31, s13, 16 +; VI-NEXT: s_lshr_b32 s34, s13, 8 +; VI-NEXT: s_lshr_b32 s35, s12, 16 +; VI-NEXT: s_lshr_b32 s36, s12, 8 +; VI-NEXT: s_lshr_b32 s37, s15, 24 +; VI-NEXT: s_lshr_b32 s38, s15, 16 +; VI-NEXT: s_lshr_b32 s39, s15, 8 +; VI-NEXT: s_lshr_b32 s48, s14, 16 +; VI-NEXT: s_lshr_b32 s49, s14, 8 +; VI-NEXT: s_lshr_b32 s50, s17, 24 +; VI-NEXT: s_lshr_b32 s51, s17, 16 +; VI-NEXT: s_lshr_b32 s52, s17, 8 +; VI-NEXT: s_lshr_b32 s53, s16, 16 +; VI-NEXT: s_lshr_b32 s54, s16, 8 +; VI-NEXT: s_lshr_b32 s55, s19, 24 +; VI-NEXT: s_lshr_b32 s64, s19, 16 +; VI-NEXT: s_lshr_b32 s65, s19, 8 +; VI-NEXT: s_lshr_b32 s66, s18, 16 +; VI-NEXT: s_lshr_b32 s67, s18, 8 ; VI-NEXT: .LBB69_3: ; %end -; VI-NEXT: s_and_b32 s7, s16, 0xff -; VI-NEXT: s_lshl_b32 s9, s67, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s66, 0xff -; VI-NEXT: s_lshl_b32 s11, s44, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_and_b32 s7, s17, 0xff -; VI-NEXT: s_lshl_b32 s9, s65, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s64, 0xff -; VI-NEXT: s_lshl_b32 s11, s55, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s54, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s53, 0xff -; VI-NEXT: s_lshl_b32 s11, s42, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_lshl_b32 s21, s67, 8 +; VI-NEXT: s_or_b32 s18, s18, s21 +; VI-NEXT: s_and_b32 s21, s66, 0xff +; VI-NEXT: s_lshl_b32 s23, s44, 8 +; VI-NEXT: s_or_b32 s21, s21, s23 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s21, s21, 16 +; VI-NEXT: s_or_b32 s18, s18, s21 +; VI-NEXT: v_mov_b32_e32 v1, s18 +; VI-NEXT: s_and_b32 s18, s19, 0xff +; VI-NEXT: s_lshl_b32 s19, s65, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, s64, 0xff +; VI-NEXT: s_lshl_b32 s21, s55, 8 +; VI-NEXT: s_or_b32 s19, s19, s21 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s19, s19, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_lshl_b32 s18, s54, 8 +; VI-NEXT: s_or_b32 s16, s16, s18 +; VI-NEXT: s_and_b32 s18, s53, 0xff +; VI-NEXT: s_lshl_b32 s19, s42, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s18, s18, 16 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s16, s16, s18 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s19, 0xff -; VI-NEXT: s_lshl_b32 s9, s52, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s51, 0xff -; VI-NEXT: s_lshl_b32 s11, s50, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xff +; VI-NEXT: s_lshl_b32 s17, s52, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s51, 0xff +; VI-NEXT: s_lshl_b32 s18, s50, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s20, 0xff -; VI-NEXT: s_lshl_b32 s9, s49, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s48, 0xff -; VI-NEXT: s_lshl_b32 s11, s40, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s16, s49, 8 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s48, 0xff +; VI-NEXT: s_lshl_b32 s17, s40, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s14, s14, s16 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s21, 0xff -; VI-NEXT: s_lshl_b32 s9, s39, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s38, 0xff -; VI-NEXT: s_lshl_b32 s11, s37, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s14, s15, 0xff +; VI-NEXT: s_lshl_b32 s15, s39, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s15, s38, 0xff +; VI-NEXT: s_lshl_b32 s16, s37, 8 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s14, s14, s15 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s22, 0xff -; VI-NEXT: s_lshl_b32 s9, s36, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s35, 0xff -; VI-NEXT: s_lshl_b32 s11, s14, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s14, s36, 8 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: s_and_b32 s14, s35, 0xff +; VI-NEXT: s_lshl_b32 s15, s28, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s12, s12, s14 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s23, 0xff -; VI-NEXT: s_lshl_b32 s9, s34, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s31, 0xff -; VI-NEXT: s_lshl_b32 s11, s30, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s12, s13, 0xff +; VI-NEXT: s_lshl_b32 s13, s34, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s13, s31, 0xff +; VI-NEXT: s_lshl_b32 s14, s30, 8 +; VI-NEXT: s_or_b32 s13, s13, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s12, s12, s13 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_lshl_b32 s9, s91, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s90, 0xff -; VI-NEXT: s_lshl_b32 s11, s12, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s12, s91, 8 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: s_and_b32 s12, s90, 0xff +; VI-NEXT: s_lshl_b32 s13, s26, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s10, s10, s12 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s25, 0xff -; VI-NEXT: s_lshl_b32 s9, s89, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s88, 0xff -; VI-NEXT: s_lshl_b32 s11, s79, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s10, s11, 0xff +; VI-NEXT: s_lshl_b32 s11, s89, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s88, 0xff +; VI-NEXT: s_lshl_b32 s12, s79, 8 +; VI-NEXT: s_or_b32 s11, s11, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s10, s10, s11 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s9, s78, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s77, 0xff -; VI-NEXT: s_lshl_b32 s10, s10, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s10, s78, 8 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s77, 0xff +; VI-NEXT: s_lshl_b32 s11, s24, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s10 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s27, 0xff +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s8, s9, 0xff ; VI-NEXT: s_lshl_b32 s9, s76, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s9, s75, 0xff ; VI-NEXT: s_lshl_b32 s10, s74, 8 ; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s28, 0xff -; VI-NEXT: s_lshl_b32 s9, s73, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s72, 0xff -; VI-NEXT: s_lshl_b32 s8, s8, 8 -; VI-NEXT: s_or_b32 s8, s9, s8 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s8, s73, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s72, 0xff +; VI-NEXT: s_lshl_b32 s9, s22, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s29, 0xff -; VI-NEXT: s_lshl_b32 s8, s63, 8 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_and_b32 s6, s7, 0xff +; VI-NEXT: s_lshl_b32 s7, s63, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s62, 0xff +; VI-NEXT: s_lshl_b32 s8, s61, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s62, 0xff -; VI-NEXT: s_lshl_b32 s9, s61, 8 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xff -; VI-NEXT: s_lshl_b32 s7, s60, 8 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s59, 0xff -; VI-NEXT: s_lshl_b32 s6, s6, 8 -; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_lshl_b32 s6, s60, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s59, 0xff +; VI-NEXT: s_lshl_b32 s7, s20, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 @@ -39774,28 +40249,28 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_readlane_b32 s67, v4, 19 -; VI-NEXT: v_readlane_b32 s66, v4, 18 -; VI-NEXT: v_readlane_b32 s65, v4, 17 -; VI-NEXT: v_readlane_b32 s64, v4, 16 -; VI-NEXT: v_readlane_b32 s55, v4, 15 -; VI-NEXT: v_readlane_b32 s54, v4, 14 -; VI-NEXT: v_readlane_b32 s53, v4, 13 -; VI-NEXT: v_readlane_b32 s52, v4, 12 -; VI-NEXT: v_readlane_b32 s51, v4, 11 -; VI-NEXT: v_readlane_b32 s50, v4, 10 -; VI-NEXT: v_readlane_b32 s49, v4, 9 -; VI-NEXT: v_readlane_b32 s48, v4, 8 -; VI-NEXT: v_readlane_b32 s39, v4, 7 -; VI-NEXT: v_readlane_b32 s38, v4, 6 -; VI-NEXT: v_readlane_b32 s37, v4, 5 -; VI-NEXT: v_readlane_b32 s36, v4, 4 -; VI-NEXT: v_readlane_b32 s35, v4, 3 -; VI-NEXT: v_readlane_b32 s34, v4, 2 -; VI-NEXT: v_readlane_b32 s31, v4, 1 -; VI-NEXT: v_readlane_b32 s30, v4, 0 +; VI-NEXT: v_readlane_b32 s67, v18, 19 +; VI-NEXT: v_readlane_b32 s66, v18, 18 +; VI-NEXT: v_readlane_b32 s65, v18, 17 +; VI-NEXT: v_readlane_b32 s64, v18, 16 +; VI-NEXT: v_readlane_b32 s55, v18, 15 +; VI-NEXT: v_readlane_b32 s54, v18, 14 +; VI-NEXT: v_readlane_b32 s53, v18, 13 +; VI-NEXT: v_readlane_b32 s52, v18, 12 +; VI-NEXT: v_readlane_b32 s51, v18, 11 +; VI-NEXT: v_readlane_b32 s50, v18, 10 +; VI-NEXT: v_readlane_b32 s49, v18, 9 +; VI-NEXT: v_readlane_b32 s48, v18, 8 +; VI-NEXT: v_readlane_b32 s39, v18, 7 +; VI-NEXT: v_readlane_b32 s38, v18, 6 +; VI-NEXT: v_readlane_b32 s37, v18, 5 +; VI-NEXT: v_readlane_b32 s36, v18, 4 +; VI-NEXT: v_readlane_b32 s35, v18, 3 +; VI-NEXT: v_readlane_b32 s34, v18, 2 +; VI-NEXT: v_readlane_b32 s31, v18, 1 +; VI-NEXT: v_readlane_b32 s30, v18, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -39820,31 +40295,31 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; VI-NEXT: ; implicit-def: $sgpr37 ; VI-NEXT: ; implicit-def: $sgpr36 ; VI-NEXT: ; implicit-def: $sgpr35 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr34 ; VI-NEXT: ; implicit-def: $sgpr31 ; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr91 ; VI-NEXT: ; implicit-def: $sgpr90 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr89 ; VI-NEXT: ; implicit-def: $sgpr88 ; VI-NEXT: ; implicit-def: $sgpr79 ; VI-NEXT: ; implicit-def: $sgpr78 ; VI-NEXT: ; implicit-def: $sgpr77 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: ; implicit-def: $sgpr74 ; VI-NEXT: ; implicit-def: $sgpr73 ; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr22 ; VI-NEXT: ; implicit-def: $sgpr63 ; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: ; implicit-def: $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr59 -; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr20 ; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: ; implicit-def: $sgpr56 @@ -39854,28 +40329,56 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v4, s30, 0 -; GFX9-NEXT: v_writelane_b32 v4, s31, 1 -; GFX9-NEXT: v_writelane_b32 v4, s34, 2 -; GFX9-NEXT: v_writelane_b32 v4, s35, 3 -; GFX9-NEXT: v_writelane_b32 v4, s36, 4 -; GFX9-NEXT: v_writelane_b32 v4, s37, 5 -; GFX9-NEXT: v_writelane_b32 v4, s38, 6 -; GFX9-NEXT: v_writelane_b32 v4, s39, 7 -; GFX9-NEXT: v_writelane_b32 v4, s48, 8 -; GFX9-NEXT: v_writelane_b32 v4, s49, 9 -; GFX9-NEXT: v_writelane_b32 v4, s50, 10 -; GFX9-NEXT: v_writelane_b32 v4, s51, 11 -; GFX9-NEXT: v_writelane_b32 v4, s52, 12 -; GFX9-NEXT: v_writelane_b32 v4, s53, 13 +; GFX9-NEXT: v_writelane_b32 v18, s30, 0 +; GFX9-NEXT: v_writelane_b32 v18, s31, 1 +; GFX9-NEXT: v_writelane_b32 v18, s34, 2 +; GFX9-NEXT: v_writelane_b32 v18, s35, 3 +; GFX9-NEXT: v_writelane_b32 v18, s36, 4 +; GFX9-NEXT: v_writelane_b32 v18, s37, 5 +; GFX9-NEXT: v_writelane_b32 v18, s38, 6 +; GFX9-NEXT: v_writelane_b32 v18, s39, 7 +; GFX9-NEXT: v_writelane_b32 v18, s48, 8 +; GFX9-NEXT: v_writelane_b32 v18, s49, 9 +; GFX9-NEXT: v_writelane_b32 v18, s50, 10 +; GFX9-NEXT: v_writelane_b32 v18, s51, 11 +; GFX9-NEXT: v_writelane_b32 v18, s52, 12 +; GFX9-NEXT: v_writelane_b32 v18, s53, 13 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_mov_b32_e32 v8, s20 +; GFX9-NEXT: v_mov_b32_e32 v9, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s22 +; GFX9-NEXT: v_mov_b32_e32 v11, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v14, s26 +; GFX9-NEXT: v_mov_b32_e32 v15, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_writelane_b32 v4, s54, 14 +; GFX9-NEXT: v_writelane_b32 v18, s54, 14 +; GFX9-NEXT: v_readfirstlane_b32 s18, v4 +; GFX9-NEXT: v_readfirstlane_b32 s19, v5 +; GFX9-NEXT: v_readfirstlane_b32 s16, v6 +; GFX9-NEXT: v_readfirstlane_b32 s17, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s12, v10 +; GFX9-NEXT: v_readfirstlane_b32 s13, v11 +; GFX9-NEXT: v_readfirstlane_b32 s10, v12 +; GFX9-NEXT: v_readfirstlane_b32 s11, v13 +; GFX9-NEXT: v_readfirstlane_b32 s8, v14 +; GFX9-NEXT: v_readfirstlane_b32 s9, v15 +; GFX9-NEXT: v_readfirstlane_b32 s6, v16 +; GFX9-NEXT: v_readfirstlane_b32 s7, v17 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: v_writelane_b32 v4, s55, 15 +; GFX9-NEXT: v_writelane_b32 v18, s55, 15 ; GFX9-NEXT: s_cbranch_scc0 .LBB69_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b32 s56, s5, 24 @@ -39883,275 +40386,275 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; GFX9-NEXT: s_lshr_b32 s58, s5, 8 ; GFX9-NEXT: s_lshr_b32 s59, s4, 16 ; GFX9-NEXT: s_lshr_b32 s60, s4, 8 -; GFX9-NEXT: s_lshr_b32 s61, s29, 24 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s63, s29, 8 -; GFX9-NEXT: s_lshr_b32 s72, s28, 16 -; GFX9-NEXT: s_lshr_b32 s73, s28, 8 -; GFX9-NEXT: s_lshr_b32 s74, s27, 24 -; GFX9-NEXT: s_lshr_b32 s75, s27, 16 -; GFX9-NEXT: s_lshr_b32 s76, s27, 8 -; GFX9-NEXT: s_lshr_b32 s77, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s26, 8 -; GFX9-NEXT: s_lshr_b32 s79, s25, 24 -; GFX9-NEXT: s_lshr_b32 s88, s25, 16 -; GFX9-NEXT: s_lshr_b32 s89, s25, 8 -; GFX9-NEXT: s_lshr_b32 s90, s24, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 8 -; GFX9-NEXT: s_lshr_b32 s92, s23, 24 -; GFX9-NEXT: s_lshr_b32 s93, s23, 16 -; GFX9-NEXT: s_lshr_b32 s94, s23, 8 -; GFX9-NEXT: s_lshr_b32 s95, s22, 16 -; GFX9-NEXT: s_lshr_b32 s30, s22, 8 -; GFX9-NEXT: s_lshr_b32 s31, s21, 24 -; GFX9-NEXT: s_lshr_b32 s34, s21, 16 -; GFX9-NEXT: s_lshr_b32 s35, s21, 8 -; GFX9-NEXT: s_lshr_b32 s36, s20, 16 -; GFX9-NEXT: s_lshr_b32 s37, s20, 8 -; GFX9-NEXT: s_lshr_b32 s38, s19, 24 -; GFX9-NEXT: s_lshr_b32 s39, s19, 16 -; GFX9-NEXT: s_lshr_b32 s48, s19, 8 -; GFX9-NEXT: s_lshr_b32 s49, s18, 16 -; GFX9-NEXT: s_lshr_b32 s50, s18, 8 -; GFX9-NEXT: s_lshr_b32 s51, s17, 24 -; GFX9-NEXT: s_lshr_b32 s52, s17, 16 -; GFX9-NEXT: s_lshr_b32 s53, s17, 8 -; GFX9-NEXT: s_lshr_b32 s54, s16, 16 -; GFX9-NEXT: s_lshr_b32 s55, s16, 8 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s61, s7, 24 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s63, s7, 8 +; GFX9-NEXT: s_lshr_b32 s72, s6, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 8 +; GFX9-NEXT: s_lshr_b32 s74, s9, 24 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s76, s9, 8 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s8, 8 +; GFX9-NEXT: s_lshr_b32 s79, s11, 24 +; GFX9-NEXT: s_lshr_b32 s88, s11, 16 +; GFX9-NEXT: s_lshr_b32 s89, s11, 8 +; GFX9-NEXT: s_lshr_b32 s90, s10, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 8 +; GFX9-NEXT: s_lshr_b32 s92, s13, 24 +; GFX9-NEXT: s_lshr_b32 s93, s13, 16 +; GFX9-NEXT: s_lshr_b32 s94, s13, 8 +; GFX9-NEXT: s_lshr_b32 s95, s12, 16 +; GFX9-NEXT: s_lshr_b32 s30, s12, 8 +; GFX9-NEXT: s_lshr_b32 s31, s15, 24 +; GFX9-NEXT: s_lshr_b32 s34, s15, 16 +; GFX9-NEXT: s_lshr_b32 s35, s15, 8 +; GFX9-NEXT: s_lshr_b32 s36, s14, 16 +; GFX9-NEXT: s_lshr_b32 s37, s14, 8 +; GFX9-NEXT: s_lshr_b32 s38, s17, 24 +; GFX9-NEXT: s_lshr_b32 s39, s17, 16 +; GFX9-NEXT: s_lshr_b32 s48, s17, 8 +; GFX9-NEXT: s_lshr_b32 s49, s16, 16 +; GFX9-NEXT: s_lshr_b32 s50, s16, 8 +; GFX9-NEXT: s_lshr_b32 s51, s19, 24 +; GFX9-NEXT: s_lshr_b32 s52, s19, 16 +; GFX9-NEXT: s_lshr_b32 s53, s19, 8 +; GFX9-NEXT: s_lshr_b32 s54, s18, 16 +; GFX9-NEXT: s_lshr_b32 s55, s18, 8 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB69_3 ; GFX9-NEXT: .LBB69_2: ; %cmp.true -; GFX9-NEXT: s_add_u32 s16, s16, 3 -; GFX9-NEXT: s_addc_u32 s17, s17, 0 ; GFX9-NEXT: s_add_u32 s18, s18, 3 ; GFX9-NEXT: s_addc_u32 s19, s19, 0 -; GFX9-NEXT: s_add_u32 s20, s20, 3 -; GFX9-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-NEXT: s_add_u32 s22, s22, 3 -; GFX9-NEXT: s_addc_u32 s23, s23, 0 -; GFX9-NEXT: s_add_u32 s24, s24, 3 -; GFX9-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-NEXT: s_add_u32 s26, s26, 3 -; GFX9-NEXT: s_addc_u32 s27, s27, 0 -; GFX9-NEXT: s_add_u32 s28, s28, 3 -; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_add_u32 s4, s4, 3 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; GFX9-NEXT: s_lshr_b32 s56, s5, 24 ; GFX9-NEXT: s_lshr_b32 s57, s5, 16 ; GFX9-NEXT: s_lshr_b32 s58, s5, 8 ; GFX9-NEXT: s_lshr_b32 s59, s4, 16 ; GFX9-NEXT: s_lshr_b32 s60, s4, 8 -; GFX9-NEXT: s_lshr_b32 s61, s29, 24 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s63, s29, 8 -; GFX9-NEXT: s_lshr_b32 s72, s28, 16 -; GFX9-NEXT: s_lshr_b32 s73, s28, 8 -; GFX9-NEXT: s_lshr_b32 s74, s27, 24 -; GFX9-NEXT: s_lshr_b32 s75, s27, 16 -; GFX9-NEXT: s_lshr_b32 s76, s27, 8 -; GFX9-NEXT: s_lshr_b32 s77, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s26, 8 -; GFX9-NEXT: s_lshr_b32 s79, s25, 24 -; GFX9-NEXT: s_lshr_b32 s88, s25, 16 -; GFX9-NEXT: s_lshr_b32 s89, s25, 8 -; GFX9-NEXT: s_lshr_b32 s90, s24, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 8 -; GFX9-NEXT: s_lshr_b32 s92, s23, 24 -; GFX9-NEXT: s_lshr_b32 s93, s23, 16 -; GFX9-NEXT: s_lshr_b32 s94, s23, 8 -; GFX9-NEXT: s_lshr_b32 s95, s22, 16 -; GFX9-NEXT: s_lshr_b32 s30, s22, 8 -; GFX9-NEXT: s_lshr_b32 s31, s21, 24 -; GFX9-NEXT: s_lshr_b32 s34, s21, 16 -; GFX9-NEXT: s_lshr_b32 s35, s21, 8 -; GFX9-NEXT: s_lshr_b32 s36, s20, 16 -; GFX9-NEXT: s_lshr_b32 s37, s20, 8 -; GFX9-NEXT: s_lshr_b32 s38, s19, 24 -; GFX9-NEXT: s_lshr_b32 s39, s19, 16 -; GFX9-NEXT: s_lshr_b32 s48, s19, 8 -; GFX9-NEXT: s_lshr_b32 s49, s18, 16 -; GFX9-NEXT: s_lshr_b32 s50, s18, 8 -; GFX9-NEXT: s_lshr_b32 s51, s17, 24 -; GFX9-NEXT: s_lshr_b32 s52, s17, 16 -; GFX9-NEXT: s_lshr_b32 s53, s17, 8 -; GFX9-NEXT: s_lshr_b32 s54, s16, 16 -; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b32 s61, s7, 24 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s63, s7, 8 +; GFX9-NEXT: s_lshr_b32 s72, s6, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 8 +; GFX9-NEXT: s_lshr_b32 s74, s9, 24 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s76, s9, 8 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s8, 8 +; GFX9-NEXT: s_lshr_b32 s79, s11, 24 +; GFX9-NEXT: s_lshr_b32 s88, s11, 16 +; GFX9-NEXT: s_lshr_b32 s89, s11, 8 +; GFX9-NEXT: s_lshr_b32 s90, s10, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 8 +; GFX9-NEXT: s_lshr_b32 s92, s13, 24 +; GFX9-NEXT: s_lshr_b32 s93, s13, 16 +; GFX9-NEXT: s_lshr_b32 s94, s13, 8 +; GFX9-NEXT: s_lshr_b32 s95, s12, 16 +; GFX9-NEXT: s_lshr_b32 s30, s12, 8 +; GFX9-NEXT: s_lshr_b32 s31, s15, 24 +; GFX9-NEXT: s_lshr_b32 s34, s15, 16 +; GFX9-NEXT: s_lshr_b32 s35, s15, 8 +; GFX9-NEXT: s_lshr_b32 s36, s14, 16 +; GFX9-NEXT: s_lshr_b32 s37, s14, 8 +; GFX9-NEXT: s_lshr_b32 s38, s17, 24 +; GFX9-NEXT: s_lshr_b32 s39, s17, 16 +; GFX9-NEXT: s_lshr_b32 s48, s17, 8 +; GFX9-NEXT: s_lshr_b32 s49, s16, 16 +; GFX9-NEXT: s_lshr_b32 s50, s16, 8 +; GFX9-NEXT: s_lshr_b32 s51, s19, 24 +; GFX9-NEXT: s_lshr_b32 s52, s19, 16 +; GFX9-NEXT: s_lshr_b32 s53, s19, 8 +; GFX9-NEXT: s_lshr_b32 s54, s18, 16 +; GFX9-NEXT: s_lshr_b32 s55, s18, 8 ; GFX9-NEXT: .LBB69_3: ; %end -; GFX9-NEXT: s_and_b32 s7, s16, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s55, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s54, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s44, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s53, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s52, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s51, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s18, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s55, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s21 +; GFX9-NEXT: s_and_b32 s21, s54, 0xff +; GFX9-NEXT: s_lshl_b32 s23, s44, 8 +; GFX9-NEXT: s_or_b32 s21, s21, s23 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s21, s21, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s21 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s18, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s53, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: s_and_b32 s19, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s51, 8 +; GFX9-NEXT: s_or_b32 s19, s19, s21 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s19, s19, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s19 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s50, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s49, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s42, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s50, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s18 +; GFX9-NEXT: s_and_b32 s18, s49, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s42, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s18, s18, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s19, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s48, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s39, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s38, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s48, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s38, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s37, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s36, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s40, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s37, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s16 +; GFX9-NEXT: s_and_b32 s16, s36, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s40, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s16, s16, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s16 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s21, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s35, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s34, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s31, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s14, s15, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s35, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s15, s34, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s31, 8 +; GFX9-NEXT: s_or_b32 s15, s15, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s15, s15, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s15 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s30, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s95, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s14, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s30, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s14 +; GFX9-NEXT: s_and_b32 s14, s95, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s28, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s14, s14, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s14 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s23, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s94, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s93, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s92, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s12, s13, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s94, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s13, s93, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s92, 8 +; GFX9-NEXT: s_or_b32 s13, s13, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s13, s13, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s13 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s91, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s90, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s12, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s91, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s12 +; GFX9-NEXT: s_and_b32 s12, s90, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s26, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s12, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s12 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s25, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s89, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s88, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s79, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s89, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s11, s88, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s79, 8 +; GFX9-NEXT: s_or_b32 s11, s11, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s11, s11, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s78, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s77, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s10 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s78, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s10 +; GFX9-NEXT: s_and_b32 s10, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s24, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: s_or_b32 s8, s8, s10 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s27, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s8, s9, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s76, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_and_b32 s9, s75, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s74, 8 ; GFX9-NEXT: s_or_b32 s9, s9, s10 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff ; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s73, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s72, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_or_b32 s8, s9, s8 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s29, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s63, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s8, s62, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s61, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s73, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: s_and_b32 s8, s72, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s22, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s7, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s63, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s62, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s61, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s60, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s7 -; GFX9-NEXT: s_and_b32 s7, s59, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_lshl_b32 s6, s60, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s59, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s20, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s6 @@ -40169,24 +40672,24 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_readlane_b32 s55, v4, 15 -; GFX9-NEXT: v_readlane_b32 s54, v4, 14 -; GFX9-NEXT: v_readlane_b32 s53, v4, 13 -; GFX9-NEXT: v_readlane_b32 s52, v4, 12 -; GFX9-NEXT: v_readlane_b32 s51, v4, 11 -; GFX9-NEXT: v_readlane_b32 s50, v4, 10 -; GFX9-NEXT: v_readlane_b32 s49, v4, 9 -; GFX9-NEXT: v_readlane_b32 s48, v4, 8 -; GFX9-NEXT: v_readlane_b32 s39, v4, 7 -; GFX9-NEXT: v_readlane_b32 s38, v4, 6 -; GFX9-NEXT: v_readlane_b32 s37, v4, 5 -; GFX9-NEXT: v_readlane_b32 s36, v4, 4 -; GFX9-NEXT: v_readlane_b32 s35, v4, 3 -; GFX9-NEXT: v_readlane_b32 s34, v4, 2 -; GFX9-NEXT: v_readlane_b32 s31, v4, 1 -; GFX9-NEXT: v_readlane_b32 s30, v4, 0 +; GFX9-NEXT: v_readlane_b32 s55, v18, 15 +; GFX9-NEXT: v_readlane_b32 s54, v18, 14 +; GFX9-NEXT: v_readlane_b32 s53, v18, 13 +; GFX9-NEXT: v_readlane_b32 s52, v18, 12 +; GFX9-NEXT: v_readlane_b32 s51, v18, 11 +; GFX9-NEXT: v_readlane_b32 s50, v18, 10 +; GFX9-NEXT: v_readlane_b32 s49, v18, 9 +; GFX9-NEXT: v_readlane_b32 s48, v18, 8 +; GFX9-NEXT: v_readlane_b32 s39, v18, 7 +; GFX9-NEXT: v_readlane_b32 s38, v18, 6 +; GFX9-NEXT: v_readlane_b32 s37, v18, 5 +; GFX9-NEXT: v_readlane_b32 s36, v18, 4 +; GFX9-NEXT: v_readlane_b32 s35, v18, 3 +; GFX9-NEXT: v_readlane_b32 s34, v18, 2 +; GFX9-NEXT: v_readlane_b32 s31, v18, 1 +; GFX9-NEXT: v_readlane_b32 s30, v18, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -40211,31 +40714,31 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; GFX9-NEXT: ; implicit-def: $sgpr31 ; GFX9-NEXT: ; implicit-def: $sgpr30 ; GFX9-NEXT: ; implicit-def: $sgpr95 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr94 ; GFX9-NEXT: ; implicit-def: $sgpr93 ; GFX9-NEXT: ; implicit-def: $sgpr92 ; GFX9-NEXT: ; implicit-def: $sgpr91 ; GFX9-NEXT: ; implicit-def: $sgpr90 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr89 ; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr79 ; GFX9-NEXT: ; implicit-def: $sgpr78 ; GFX9-NEXT: ; implicit-def: $sgpr77 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: ; implicit-def: $sgpr76 ; GFX9-NEXT: ; implicit-def: $sgpr75 ; GFX9-NEXT: ; implicit-def: $sgpr74 ; GFX9-NEXT: ; implicit-def: $sgpr73 ; GFX9-NEXT: ; implicit-def: $sgpr72 -; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr22 ; GFX9-NEXT: ; implicit-def: $sgpr63 ; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr61 ; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr59 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr20 ; GFX9-NEXT: ; implicit-def: $sgpr58 ; GFX9-NEXT: ; implicit-def: $sgpr57 ; GFX9-NEXT: ; implicit-def: $sgpr56 @@ -45775,111 +46278,139 @@ define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i ; VI-LABEL: bitcast_v32i16_to_v8f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s25 +; VI-NEXT: v_mov_b32_e32 v13, s26 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v3 +; VI-NEXT: v_readfirstlane_b32 s7, v4 +; VI-NEXT: v_readfirstlane_b32 s8, v5 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s10, v7 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 +; VI-NEXT: v_readfirstlane_b32 s15, v12 +; VI-NEXT: v_readfirstlane_b32 s16, v13 +; VI-NEXT: v_readfirstlane_b32 s17, v14 +; VI-NEXT: v_readfirstlane_b32 s18, v15 +; VI-NEXT: v_readfirstlane_b32 s19, v16 +; VI-NEXT: v_readfirstlane_b32 s20, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v1 ; VI-NEXT: s_cbranch_scc0 .LBB75_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB75_3 ; VI-NEXT: .LBB75_2: ; %cmp.true -; VI-NEXT: s_add_i32 s5, s7, 3 -; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s6, 3 -; VI-NEXT: s_add_i32 s7, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s29, 3 -; VI-NEXT: s_add_i32 s6, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s28, 3 -; VI-NEXT: s_add_i32 s29, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s27, 3 -; VI-NEXT: s_add_i32 s28, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s26, 3 -; VI-NEXT: s_add_i32 s27, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s25, 3 -; VI-NEXT: s_add_i32 s26, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s15, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s24, 3 -; VI-NEXT: s_add_i32 s25, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s14, 3 +; VI-NEXT: s_add_i32 s15, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s14, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s23, 3 -; VI-NEXT: s_add_i32 s24, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s13, 3 +; VI-NEXT: s_add_i32 s14, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s13, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s22, 3 -; VI-NEXT: s_add_i32 s23, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s12, 3 +; VI-NEXT: s_add_i32 s13, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s12, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s21, 3 -; VI-NEXT: s_add_i32 s22, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s11, 3 +; VI-NEXT: s_add_i32 s12, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s11, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s20, 3 -; VI-NEXT: s_add_i32 s21, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s10, 3 +; VI-NEXT: s_add_i32 s11, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s10, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s19, 3 -; VI-NEXT: s_add_i32 s20, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s9, 3 +; VI-NEXT: s_add_i32 s10, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s9, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s18, 3 -; VI-NEXT: s_add_i32 s19, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s8, 3 +; VI-NEXT: s_add_i32 s9, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s8, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s17, 3 -; VI-NEXT: s_add_i32 s18, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_add_i32 s8, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s17, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: .LBB75_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s6 -; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s18 +; VI-NEXT: v_mov_b32_e32 v13, s19 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB75_4: ; VI-NEXT: s_branch .LBB75_2 @@ -46253,102 +46784,114 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s4, v0 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_mov_b32_e32 v54, s16 +; SI-NEXT: v_mov_b32_e32 v55, s17 +; SI-NEXT: v_mov_b32_e32 v52, s18 +; SI-NEXT: v_mov_b32_e32 v53, s19 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v51, s21 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v38, s24 +; SI-NEXT: v_mov_b32_e32 v39, s25 +; SI-NEXT: v_mov_b32_e32 v36, s26 +; SI-NEXT: v_mov_b32_e32 v37, s27 +; SI-NEXT: v_mov_b32_e32 v34, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v35, s29 ; SI-NEXT: s_cbranch_scc0 .LBB77_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s6, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: s_lshr_b32 s6, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: s_lshr_b32 s6, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s6 -; SI-NEXT: s_lshr_b32 s6, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: s_lshr_b32 s6, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 -; SI-NEXT: s_lshr_b32 s6, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; SI-NEXT: s_lshr_b32 s6, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 -; SI-NEXT: s_lshr_b32 s6, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 -; SI-NEXT: s_lshr_b32 s6, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: s_lshr_b32 s6, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: s_lshr_b32 s6, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: s_lshr_b32 s6, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v54 ; SI-NEXT: s_cbranch_execnz .LBB77_3 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[14:15], s[4:5], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_add_f64 v[31:32], v[54:55], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[52:53], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[50:51], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[48:49], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[38:39], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[36:37], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[34:35], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 @@ -46356,19 +46899,21 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v32 +; SI-NEXT: v_mov_b32_e32 v1, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB77_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -49226,172 +49771,209 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshr_b64 v[0:1], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v41 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v59 +; SI-NEXT: v_lshr_b64 v[5:6], v[37:38], 16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; SI-NEXT: v_lshr_b64 v[6:7], v[35:36], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v57 +; SI-NEXT: v_lshr_b64 v[7:8], v[33:34], 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v56 +; SI-NEXT: v_lshr_b64 v[8:9], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v47 +; SI-NEXT: v_lshr_b64 v[9:10], v[29:30], 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v46 +; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v45 +; SI-NEXT: v_lshr_b64 v[11:12], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v44 +; SI-NEXT: v_lshr_b64 v[12:13], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshr_b64 v[13:14], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshr_b64 v[14:15], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v20, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16 +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v56 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v47 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v42 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 ; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -49400,687 +49982,665 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-LABEL: bitcast_v32bf16_to_v8f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 -; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: v_mov_b32_e32 v10, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s31, v1 -; VI-NEXT: s_cbranch_scc0 .LBB83_3 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_cbranch_scc0 .LBB83_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB83_4 +; VI-NEXT: s_cbranch_execnz .LBB83_3 ; VI-NEXT: .LBB83_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[17:18] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[18:19] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[19:20] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[20:21] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[21:22] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_lshrrev_b64 v[21:22], 16, v[21:22] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[22:23] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[23:24] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v24, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; VI-NEXT: s_branch .LBB83_5 -; VI-NEXT: .LBB83_3: -; VI-NEXT: s_branch .LBB83_2 -; VI-NEXT: .LBB83_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s30 -; VI-NEXT: v_mov_b32_e32 v15, s31 -; VI-NEXT: .LBB83_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[24:25] +; VI-NEXT: v_mov_b32_e32 v1, v23 +; VI-NEXT: v_mov_b32_e32 v3, v22 +; VI-NEXT: v_mov_b32_e32 v5, v21 +; VI-NEXT: v_mov_b32_e32 v7, v20 +; VI-NEXT: v_mov_b32_e32 v9, v19 +; VI-NEXT: v_mov_b32_e32 v11, v18 +; VI-NEXT: v_mov_b32_e32 v13, v17 +; VI-NEXT: v_mov_b32_e32 v15, v16 +; VI-NEXT: .LBB83_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB83_4: +; VI-NEXT: s_branch .LBB83_2 ; ; GFX9-LABEL: bitcast_v32bf16_to_v8f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 -; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s31, v1 -; GFX9-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB83_4 +; GFX9-NEXT: s_cbranch_execnz .LBB83_3 ; GFX9-NEXT: .LBB83_2: ; %cmp.true -; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s4, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v16 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v16, v16, v15 +; GFX9-NEXT: v_add_u32_e32 v16, 0x7fff, v16 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc ; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_and_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v17, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v14 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v14, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v17, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v13 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v17, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v12 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v17, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v11 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v17, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v10 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v17, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v9 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v17, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v8 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v17, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v7 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v17, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v6 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v5 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v17, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v4 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v17, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v3 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v1 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc ; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 -; GFX9-NEXT: s_branch .LBB83_5 -; GFX9-NEXT: .LBB83_3: -; GFX9-NEXT: s_branch .LBB83_2 -; GFX9-NEXT: .LBB83_4: -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 -; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: v_mov_b32_e32 v14, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s31 -; GFX9-NEXT: .LBB83_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v17, 16, v0 +; GFX9-NEXT: .LBB83_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB83_4: +; GFX9-NEXT: s_branch .LBB83_2 ; ; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v8f64_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -51147,22 +51707,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -51198,6 +51742,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -51472,22 +52032,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -51521,6 +52065,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -51784,25 +52344,9 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -51829,6 +52373,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -52428,10 +52988,38 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v40, s83, 27 ; SI-NEXT: v_writelane_b32 v40, s84, 28 ; SI-NEXT: v_writelane_b32 v40, s85, 29 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_writelane_b32 v40, s86, 30 +; SI-NEXT: v_readfirstlane_b32 s18, v4 +; SI-NEXT: v_readfirstlane_b32 s19, v5 +; SI-NEXT: v_readfirstlane_b32 s16, v6 +; SI-NEXT: v_readfirstlane_b32 s17, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v17 ; SI-NEXT: v_readfirstlane_b32 s4, v1 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: s_and_b64 s[20:21], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v2 ; SI-NEXT: v_writelane_b32 v40, s87, 31 ; SI-NEXT: s_cbranch_scc0 .LBB85_3 @@ -52439,62 +53027,62 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s48, s5, 24 ; SI-NEXT: s_lshr_b32 s49, s5, 16 ; SI-NEXT: s_lshr_b32 s50, s5, 8 -; SI-NEXT: s_lshr_b32 s51, s29, 24 -; SI-NEXT: s_lshr_b32 s52, s29, 16 -; SI-NEXT: s_lshr_b32 s53, s29, 8 -; SI-NEXT: s_lshr_b32 s54, s27, 24 -; SI-NEXT: s_lshr_b32 s55, s27, 16 -; SI-NEXT: s_lshr_b32 s64, s27, 8 -; SI-NEXT: s_lshr_b32 s65, s25, 24 -; SI-NEXT: s_lshr_b32 s66, s25, 16 -; SI-NEXT: s_lshr_b32 s67, s25, 8 -; SI-NEXT: s_lshr_b32 s68, s23, 24 -; SI-NEXT: s_lshr_b32 s69, s23, 16 -; SI-NEXT: s_lshr_b32 s70, s23, 8 -; SI-NEXT: s_lshr_b32 s71, s21, 24 -; SI-NEXT: s_lshr_b32 s80, s21, 16 -; SI-NEXT: s_lshr_b32 s81, s21, 8 -; SI-NEXT: s_lshr_b32 s82, s19, 24 -; SI-NEXT: s_lshr_b32 s83, s19, 16 -; SI-NEXT: s_lshr_b32 s84, s19, 8 -; SI-NEXT: s_lshr_b32 s85, s17, 24 -; SI-NEXT: s_lshr_b32 s86, s17, 16 -; SI-NEXT: s_lshr_b32 s87, s17, 8 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[60:61], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[78:79], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s51, s7, 24 +; SI-NEXT: s_lshr_b32 s52, s7, 16 +; SI-NEXT: s_lshr_b32 s53, s7, 8 +; SI-NEXT: s_lshr_b32 s54, s9, 24 +; SI-NEXT: s_lshr_b32 s55, s9, 16 +; SI-NEXT: s_lshr_b32 s64, s9, 8 +; SI-NEXT: s_lshr_b32 s65, s11, 24 +; SI-NEXT: s_lshr_b32 s66, s11, 16 +; SI-NEXT: s_lshr_b32 s67, s11, 8 +; SI-NEXT: s_lshr_b32 s68, s13, 24 +; SI-NEXT: s_lshr_b32 s69, s13, 16 +; SI-NEXT: s_lshr_b32 s70, s13, 8 +; SI-NEXT: s_lshr_b32 s71, s15, 24 +; SI-NEXT: s_lshr_b32 s80, s15, 16 +; SI-NEXT: s_lshr_b32 s81, s15, 8 +; SI-NEXT: s_lshr_b32 s82, s17, 24 +; SI-NEXT: s_lshr_b32 s83, s17, 16 +; SI-NEXT: s_lshr_b32 s84, s17, 8 +; SI-NEXT: s_lshr_b32 s85, s19, 24 +; SI-NEXT: s_lshr_b32 s86, s19, 16 +; SI-NEXT: s_lshr_b32 s87, s19, 8 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[22:23], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 8 ; SI-NEXT: s_cbranch_execnz .LBB85_4 ; SI-NEXT: .LBB85_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[28:29], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[5:6], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[13:14], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[3:4], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[28:29], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[13:14], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 ; SI-NEXT: v_lshr_b64 v[48:49], v[28:29], 24 ; SI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[20:21], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[32:33], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[20:21], s[14:15], 1.0 +; SI-NEXT: v_add_f64 v[32:33], s[18:19], 1.0 ; SI-NEXT: v_lshr_b64 v[22:23], v[5:6], 16 ; SI-NEXT: v_lshr_b64 v[34:35], v[13:14], 16 ; SI-NEXT: v_lshr_b64 v[49:50], v[28:29], 16 @@ -52512,13 +53100,13 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_lshr_b64 v[25:26], v[7:8], 16 ; SI-NEXT: v_lshr_b64 v[37:38], v[20:21], 16 ; SI-NEXT: v_lshr_b64 v[52:53], v[32:33], 16 -; SI-NEXT: v_readfirstlane_b32 s17, v33 -; SI-NEXT: v_readfirstlane_b32 s19, v29 -; SI-NEXT: v_readfirstlane_b32 s21, v21 -; SI-NEXT: v_readfirstlane_b32 s23, v14 -; SI-NEXT: v_readfirstlane_b32 s25, v8 -; SI-NEXT: v_readfirstlane_b32 s27, v6 -; SI-NEXT: v_readfirstlane_b32 s29, v4 +; SI-NEXT: v_readfirstlane_b32 s19, v33 +; SI-NEXT: v_readfirstlane_b32 s17, v29 +; SI-NEXT: v_readfirstlane_b32 s15, v21 +; SI-NEXT: v_readfirstlane_b32 s13, v14 +; SI-NEXT: v_readfirstlane_b32 s11, v8 +; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v4 ; SI-NEXT: v_readfirstlane_b32 s5, v2 ; SI-NEXT: v_lshr_b64 v[11:12], v[1:2], 8 ; SI-NEXT: v_lshr_b64 v[18:19], v[5:6], 24 @@ -52529,27 +53117,27 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s48, s5, 24 ; SI-NEXT: s_lshr_b32 s49, s5, 16 ; SI-NEXT: s_lshr_b32 s50, s5, 8 -; SI-NEXT: s_lshr_b32 s51, s29, 24 -; SI-NEXT: s_lshr_b32 s52, s29, 16 -; SI-NEXT: s_lshr_b32 s53, s29, 8 -; SI-NEXT: s_lshr_b32 s54, s27, 24 -; SI-NEXT: s_lshr_b32 s55, s27, 16 -; SI-NEXT: s_lshr_b32 s64, s27, 8 -; SI-NEXT: s_lshr_b32 s65, s25, 24 -; SI-NEXT: s_lshr_b32 s66, s25, 16 -; SI-NEXT: s_lshr_b32 s67, s25, 8 -; SI-NEXT: s_lshr_b32 s68, s23, 24 -; SI-NEXT: s_lshr_b32 s69, s23, 16 -; SI-NEXT: s_lshr_b32 s70, s23, 8 -; SI-NEXT: s_lshr_b32 s71, s21, 24 -; SI-NEXT: s_lshr_b32 s80, s21, 16 -; SI-NEXT: s_lshr_b32 s81, s21, 8 -; SI-NEXT: s_lshr_b32 s82, s19, 24 -; SI-NEXT: s_lshr_b32 s83, s19, 16 -; SI-NEXT: s_lshr_b32 s84, s19, 8 -; SI-NEXT: s_lshr_b32 s85, s17, 24 -; SI-NEXT: s_lshr_b32 s86, s17, 16 -; SI-NEXT: s_lshr_b32 s87, s17, 8 +; SI-NEXT: s_lshr_b32 s51, s7, 24 +; SI-NEXT: s_lshr_b32 s52, s7, 16 +; SI-NEXT: s_lshr_b32 s53, s7, 8 +; SI-NEXT: s_lshr_b32 s54, s9, 24 +; SI-NEXT: s_lshr_b32 s55, s9, 16 +; SI-NEXT: s_lshr_b32 s64, s9, 8 +; SI-NEXT: s_lshr_b32 s65, s11, 24 +; SI-NEXT: s_lshr_b32 s66, s11, 16 +; SI-NEXT: s_lshr_b32 s67, s11, 8 +; SI-NEXT: s_lshr_b32 s68, s13, 24 +; SI-NEXT: s_lshr_b32 s69, s13, 16 +; SI-NEXT: s_lshr_b32 s70, s13, 8 +; SI-NEXT: s_lshr_b32 s71, s15, 24 +; SI-NEXT: s_lshr_b32 s80, s15, 16 +; SI-NEXT: s_lshr_b32 s81, s15, 8 +; SI-NEXT: s_lshr_b32 s82, s17, 24 +; SI-NEXT: s_lshr_b32 s83, s17, 16 +; SI-NEXT: s_lshr_b32 s84, s17, 8 +; SI-NEXT: s_lshr_b32 s85, s19, 24 +; SI-NEXT: s_lshr_b32 s86, s19, 16 +; SI-NEXT: s_lshr_b32 s87, s19, 8 ; SI-NEXT: s_branch .LBB85_5 ; SI-NEXT: .LBB85_3: ; SI-NEXT: ; implicit-def: $sgpr74 @@ -52565,8 +53153,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr83 ; SI-NEXT: ; implicit-def: $sgpr82 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr81 ; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr71 @@ -52597,18 +53185,18 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr20 ; SI-NEXT: s_branch .LBB85_2 ; SI-NEXT: .LBB85_4: -; SI-NEXT: v_mov_b32_e32 v32, s16 -; SI-NEXT: v_mov_b32_e32 v28, s18 -; SI-NEXT: v_mov_b32_e32 v20, s20 -; SI-NEXT: v_mov_b32_e32 v13, s22 -; SI-NEXT: v_mov_b32_e32 v7, s24 -; SI-NEXT: v_mov_b32_e32 v5, s26 -; SI-NEXT: v_mov_b32_e32 v3, s28 +; SI-NEXT: v_mov_b32_e32 v32, s18 +; SI-NEXT: v_mov_b32_e32 v28, s16 +; SI-NEXT: v_mov_b32_e32 v20, s14 +; SI-NEXT: v_mov_b32_e32 v13, s12 +; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: v_mov_b32_e32 v53, s74 ; SI-NEXT: v_mov_b32_e32 v52, s62 @@ -52617,8 +53205,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v49, s44 ; SI-NEXT: v_mov_b32_e32 v48, s42 ; SI-NEXT: v_mov_b32_e32 v38, s40 -; SI-NEXT: v_mov_b32_e32 v37, s12 -; SI-NEXT: v_mov_b32_e32 v36, s8 +; SI-NEXT: v_mov_b32_e32 v37, s26 +; SI-NEXT: v_mov_b32_e32 v36, s22 ; SI-NEXT: v_mov_b32_e32 v35, s38 ; SI-NEXT: v_mov_b32_e32 v34, s36 ; SI-NEXT: v_mov_b32_e32 v30, s34 @@ -52631,13 +53219,13 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v17, s72 ; SI-NEXT: v_mov_b32_e32 v16, s60 ; SI-NEXT: v_mov_b32_e32 v15, s46 -; SI-NEXT: v_mov_b32_e32 v11, s14 -; SI-NEXT: v_mov_b32_e32 v10, s10 -; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: v_mov_b32_e32 v11, s28 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v9, s20 ; SI-NEXT: .LBB85_5: ; %end ; SI-NEXT: v_and_b32_e32 v2, 0xff, v32 ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v53 -; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_and_b32 s4, s19, 0xff ; SI-NEXT: s_lshl_b32 s6, s87, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v52 @@ -52646,11 +53234,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v51 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s85, 24 +; SI-NEXT: s_lshl_b32 s8, s85, 24 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen @@ -52661,7 +53249,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v28 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v50 -; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_and_b32 s4, s17, 0xff ; SI-NEXT: s_lshl_b32 s6, s84, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v49 @@ -52670,11 +53258,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v48 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s82, 24 +; SI-NEXT: s_lshl_b32 s8, s82, 24 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 @@ -52686,7 +53274,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v38 -; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_and_b32 s4, s15, 0xff ; SI-NEXT: s_lshl_b32 s6, s81, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v37 @@ -52695,11 +53283,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s71, 24 +; SI-NEXT: s_lshl_b32 s8, s71, 24 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 @@ -52711,7 +53299,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v35 -; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_and_b32 s4, s13, 0xff ; SI-NEXT: s_lshl_b32 s6, s70, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v34 @@ -52720,11 +53308,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v30 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s68, 24 +; SI-NEXT: s_lshl_b32 s8, s68, 24 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 @@ -52736,7 +53324,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v26 -; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_and_b32 s4, s11, 0xff ; SI-NEXT: s_lshl_b32 s6, s67, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v25 @@ -52745,11 +53333,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s65, 24 +; SI-NEXT: s_lshl_b32 s8, s65, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 @@ -52761,7 +53349,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 -; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_and_b32 s4, s9, 0xff ; SI-NEXT: s_lshl_b32 s6, s64, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v22 @@ -52770,11 +53358,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v18 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s54, 24 +; SI-NEXT: s_lshl_b32 s8, s54, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 @@ -52785,7 +53373,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17 -; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_and_b32 s4, s7, 0xff ; SI-NEXT: s_lshl_b32 s6, s53, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 @@ -52895,10 +53483,38 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: v_writelane_b32 v40, s55, 15 ; VI-NEXT: v_writelane_b32 v40, s64, 16 ; VI-NEXT: v_writelane_b32 v40, s65, 17 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v17, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; VI-NEXT: v_writelane_b32 v40, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s18, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s16, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s14, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s12, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s10, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s8, v14 +; VI-NEXT: v_readfirstlane_b32 s9, v15 +; VI-NEXT: v_readfirstlane_b32 s6, v16 +; VI-NEXT: v_readfirstlane_b32 s7, v17 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_and_b64 s[20:21], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 ; VI-NEXT: v_writelane_b32 v40, s67, 19 ; VI-NEXT: s_cbranch_scc0 .LBB85_3 @@ -52908,59 +53524,59 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_lshr_b32 s58, s5, 8 ; VI-NEXT: s_lshr_b32 s37, s4, 16 ; VI-NEXT: s_lshr_b32 s36, s4, 8 -; VI-NEXT: s_lshr_b32 s59, s29, 24 -; VI-NEXT: s_lshr_b32 s60, s29, 16 -; VI-NEXT: s_lshr_b32 s61, s29, 8 -; VI-NEXT: s_lshr_b32 s39, s28, 16 -; VI-NEXT: s_lshr_b32 s38, s28, 8 -; VI-NEXT: s_lshr_b32 s62, s27, 24 -; VI-NEXT: s_lshr_b32 s63, s27, 16 -; VI-NEXT: s_lshr_b32 s72, s27, 8 -; VI-NEXT: s_lshr_b32 s49, s26, 16 -; VI-NEXT: s_lshr_b32 s48, s26, 8 -; VI-NEXT: s_lshr_b32 s73, s25, 24 -; VI-NEXT: s_lshr_b32 s74, s25, 16 -; VI-NEXT: s_lshr_b32 s75, s25, 8 -; VI-NEXT: s_lshr_b32 s51, s24, 16 -; VI-NEXT: s_lshr_b32 s50, s24, 8 -; VI-NEXT: s_lshr_b32 s76, s23, 24 -; VI-NEXT: s_lshr_b32 s77, s23, 16 -; VI-NEXT: s_lshr_b32 s78, s23, 8 -; VI-NEXT: s_lshr_b32 s53, s22, 16 -; VI-NEXT: s_lshr_b32 s52, s22, 8 -; VI-NEXT: s_lshr_b32 s79, s21, 24 -; VI-NEXT: s_lshr_b32 s88, s21, 16 -; VI-NEXT: s_lshr_b32 s89, s21, 8 -; VI-NEXT: s_lshr_b32 s55, s20, 16 -; VI-NEXT: s_lshr_b32 s54, s20, 8 -; VI-NEXT: s_lshr_b32 s90, s19, 24 -; VI-NEXT: s_lshr_b32 s91, s19, 16 -; VI-NEXT: s_lshr_b32 s30, s19, 8 -; VI-NEXT: s_lshr_b32 s65, s18, 16 -; VI-NEXT: s_lshr_b32 s64, s18, 8 -; VI-NEXT: s_lshr_b32 s31, s17, 24 -; VI-NEXT: s_lshr_b32 s34, s17, 16 -; VI-NEXT: s_lshr_b32 s35, s17, 8 -; VI-NEXT: s_lshr_b32 s67, s16, 16 -; VI-NEXT: s_lshr_b32 s66, s16, 8 +; VI-NEXT: s_lshr_b32 s59, s7, 24 +; VI-NEXT: s_lshr_b32 s60, s7, 16 +; VI-NEXT: s_lshr_b32 s61, s7, 8 +; VI-NEXT: s_lshr_b32 s39, s6, 16 +; VI-NEXT: s_lshr_b32 s38, s6, 8 +; VI-NEXT: s_lshr_b32 s62, s9, 24 +; VI-NEXT: s_lshr_b32 s63, s9, 16 +; VI-NEXT: s_lshr_b32 s72, s9, 8 +; VI-NEXT: s_lshr_b32 s49, s8, 16 +; VI-NEXT: s_lshr_b32 s48, s8, 8 +; VI-NEXT: s_lshr_b32 s73, s11, 24 +; VI-NEXT: s_lshr_b32 s74, s11, 16 +; VI-NEXT: s_lshr_b32 s75, s11, 8 +; VI-NEXT: s_lshr_b32 s51, s10, 16 +; VI-NEXT: s_lshr_b32 s50, s10, 8 +; VI-NEXT: s_lshr_b32 s76, s13, 24 +; VI-NEXT: s_lshr_b32 s77, s13, 16 +; VI-NEXT: s_lshr_b32 s78, s13, 8 +; VI-NEXT: s_lshr_b32 s53, s12, 16 +; VI-NEXT: s_lshr_b32 s52, s12, 8 +; VI-NEXT: s_lshr_b32 s79, s15, 24 +; VI-NEXT: s_lshr_b32 s88, s15, 16 +; VI-NEXT: s_lshr_b32 s89, s15, 8 +; VI-NEXT: s_lshr_b32 s55, s14, 16 +; VI-NEXT: s_lshr_b32 s54, s14, 8 +; VI-NEXT: s_lshr_b32 s90, s17, 24 +; VI-NEXT: s_lshr_b32 s91, s17, 16 +; VI-NEXT: s_lshr_b32 s30, s17, 8 +; VI-NEXT: s_lshr_b32 s65, s16, 16 +; VI-NEXT: s_lshr_b32 s64, s16, 8 +; VI-NEXT: s_lshr_b32 s31, s19, 24 +; VI-NEXT: s_lshr_b32 s34, s19, 16 +; VI-NEXT: s_lshr_b32 s35, s19, 8 +; VI-NEXT: s_lshr_b32 s67, s18, 16 +; VI-NEXT: s_lshr_b32 s66, s18, 8 ; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; VI-NEXT: s_cbranch_execnz .LBB85_4 ; VI-NEXT: .LBB85_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 -; VI-NEXT: v_add_f64 v[3:4], s[28:29], 1.0 -; VI-NEXT: v_add_f64 v[5:6], s[26:27], 1.0 -; VI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 -; VI-NEXT: v_add_f64 v[11:12], s[22:23], 1.0 -; VI-NEXT: v_add_f64 v[15:16], s[20:21], 1.0 -; VI-NEXT: v_add_f64 v[9:10], s[18:19], 1.0 -; VI-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 +; VI-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 +; VI-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 +; VI-NEXT: v_add_f64 v[11:12], s[12:13], 1.0 +; VI-NEXT: v_add_f64 v[15:16], s[14:15], 1.0 +; VI-NEXT: v_add_f64 v[9:10], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[13:14], s[18:19], 1.0 ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] @@ -52968,13 +53584,13 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[11:12] ; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[15:16] ; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] -; VI-NEXT: v_readfirstlane_b32 s17, v14 -; VI-NEXT: v_readfirstlane_b32 s19, v10 -; VI-NEXT: v_readfirstlane_b32 s21, v16 -; VI-NEXT: v_readfirstlane_b32 s23, v12 -; VI-NEXT: v_readfirstlane_b32 s25, v8 -; VI-NEXT: v_readfirstlane_b32 s27, v6 -; VI-NEXT: v_readfirstlane_b32 s29, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v14 +; VI-NEXT: v_readfirstlane_b32 s17, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v16 +; VI-NEXT: v_readfirstlane_b32 s13, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s7, v4 ; VI-NEXT: v_readfirstlane_b32 s5, v2 ; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] ; VI-NEXT: s_lshr_b32 s56, s5, 24 @@ -52982,70 +53598,70 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_lshr_b32 s58, s5, 8 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v1 -; VI-NEXT: s_lshr_b32 s59, s29, 24 -; VI-NEXT: s_lshr_b32 s60, s29, 16 -; VI-NEXT: s_lshr_b32 s61, s29, 8 +; VI-NEXT: s_lshr_b32 s59, s7, 24 +; VI-NEXT: s_lshr_b32 s60, s7, 16 +; VI-NEXT: s_lshr_b32 s61, s7, 8 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; VI-NEXT: s_lshr_b32 s62, s27, 24 -; VI-NEXT: s_lshr_b32 s63, s27, 16 -; VI-NEXT: s_lshr_b32 s72, s27, 8 +; VI-NEXT: s_lshr_b32 s62, s9, 24 +; VI-NEXT: s_lshr_b32 s63, s9, 16 +; VI-NEXT: s_lshr_b32 s72, s9, 8 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: s_lshr_b32 s73, s25, 24 -; VI-NEXT: s_lshr_b32 s74, s25, 16 -; VI-NEXT: s_lshr_b32 s75, s25, 8 +; VI-NEXT: s_lshr_b32 s73, s11, 24 +; VI-NEXT: s_lshr_b32 s74, s11, 16 +; VI-NEXT: s_lshr_b32 s75, s11, 8 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v7 -; VI-NEXT: s_lshr_b32 s76, s23, 24 -; VI-NEXT: s_lshr_b32 s77, s23, 16 -; VI-NEXT: s_lshr_b32 s78, s23, 8 +; VI-NEXT: s_lshr_b32 s76, s13, 24 +; VI-NEXT: s_lshr_b32 s77, s13, 16 +; VI-NEXT: s_lshr_b32 s78, s13, 8 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v11 -; VI-NEXT: s_lshr_b32 s79, s21, 24 -; VI-NEXT: s_lshr_b32 s88, s21, 16 -; VI-NEXT: s_lshr_b32 s89, s21, 8 +; VI-NEXT: s_lshr_b32 s79, s15, 24 +; VI-NEXT: s_lshr_b32 s88, s15, 16 +; VI-NEXT: s_lshr_b32 s89, s15, 8 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v15 -; VI-NEXT: s_lshr_b32 s90, s19, 24 -; VI-NEXT: s_lshr_b32 s91, s19, 16 -; VI-NEXT: s_lshr_b32 s30, s19, 8 +; VI-NEXT: s_lshr_b32 s90, s17, 24 +; VI-NEXT: s_lshr_b32 s91, s17, 16 +; VI-NEXT: s_lshr_b32 s30, s17, 8 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v9 -; VI-NEXT: s_lshr_b32 s31, s17, 24 -; VI-NEXT: s_lshr_b32 s34, s17, 16 -; VI-NEXT: s_lshr_b32 s35, s17, 8 +; VI-NEXT: s_lshr_b32 s31, s19, 24 +; VI-NEXT: s_lshr_b32 s34, s19, 16 +; VI-NEXT: s_lshr_b32 s35, s19, 8 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v13 ; VI-NEXT: s_branch .LBB85_5 ; VI-NEXT: .LBB85_3: ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr67 -; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr20 ; VI-NEXT: ; implicit-def: $sgpr35 ; VI-NEXT: ; implicit-def: $sgpr34 ; VI-NEXT: ; implicit-def: $sgpr31 ; VI-NEXT: ; implicit-def: $sgpr64 ; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr22 ; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr91 ; VI-NEXT: ; implicit-def: $sgpr90 ; VI-NEXT: ; implicit-def: $sgpr54 ; VI-NEXT: ; implicit-def: $sgpr55 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: ; implicit-def: $sgpr89 ; VI-NEXT: ; implicit-def: $sgpr88 ; VI-NEXT: ; implicit-def: $sgpr79 ; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr53 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr78 ; VI-NEXT: ; implicit-def: $sgpr77 ; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr50 ; VI-NEXT: ; implicit-def: $sgpr51 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: ; implicit-def: $sgpr74 ; VI-NEXT: ; implicit-def: $sgpr73 @@ -53069,8 +53685,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: s_branch .LBB85_2 ; VI-NEXT: .LBB85_4: -; VI-NEXT: v_mov_b32_e32 v13, s16 -; VI-NEXT: v_mov_b32_e32 v9, s18 +; VI-NEXT: v_mov_b32_e32 v13, s18 +; VI-NEXT: v_mov_b32_e32 v9, s16 ; VI-NEXT: v_mov_b32_e32 v48, s67 ; VI-NEXT: v_mov_b32_e32 v49, s66 ; VI-NEXT: v_mov_b32_e32 v38, s65 @@ -53087,27 +53703,27 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v29, s38 ; VI-NEXT: v_mov_b32_e32 v26, s37 ; VI-NEXT: v_mov_b32_e32 v27, s36 -; VI-NEXT: v_mov_b32_e32 v15, s20 -; VI-NEXT: v_mov_b32_e32 v11, s22 -; VI-NEXT: v_mov_b32_e32 v7, s24 -; VI-NEXT: v_mov_b32_e32 v5, s26 -; VI-NEXT: v_mov_b32_e32 v3, s28 +; VI-NEXT: v_mov_b32_e32 v15, s14 +; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: v_mov_b32_e32 v7, s10 +; VI-NEXT: v_mov_b32_e32 v5, s8 +; VI-NEXT: v_mov_b32_e32 v3, s6 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v24, s6 -; VI-NEXT: v_mov_b32_e32 v23, s8 -; VI-NEXT: v_mov_b32_e32 v22, s10 -; VI-NEXT: v_mov_b32_e32 v21, s12 -; VI-NEXT: v_mov_b32_e32 v20, s14 +; VI-NEXT: v_mov_b32_e32 v24, s20 +; VI-NEXT: v_mov_b32_e32 v23, s22 +; VI-NEXT: v_mov_b32_e32 v22, s24 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v20, s28 ; VI-NEXT: v_mov_b32_e32 v19, s40 ; VI-NEXT: v_mov_b32_e32 v18, s42 ; VI-NEXT: v_mov_b32_e32 v17, s44 ; VI-NEXT: .LBB85_5: ; %end -; VI-NEXT: s_and_b32 s4, s17, 0xff +; VI-NEXT: s_and_b32 s4, s19, 0xff ; VI-NEXT: s_lshl_b32 s6, s35, 8 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s34, 0xff -; VI-NEXT: s_lshl_b32 s7, s31, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_lshl_b32 s8, s31, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v24 ; VI-NEXT: s_and_b32 s4, s4, 0xffff @@ -53117,17 +53733,17 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_and_b32 s4, s19, 0xff +; VI-NEXT: s_and_b32 s4, s17, 0xff ; VI-NEXT: s_lshl_b32 s6, s30, 8 ; VI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s91, 0xff -; VI-NEXT: s_lshl_b32 s7, s90, 8 +; VI-NEXT: s_lshl_b32 s8, s90, 8 ; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v39 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v38, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff @@ -53137,16 +53753,16 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_and_b32 s4, s21, 0xff +; VI-NEXT: s_and_b32 s4, s15, 0xff ; VI-NEXT: s_lshl_b32 s6, s89, 8 ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s88, 0xff -; VI-NEXT: s_lshl_b32 s7, s79, 8 +; VI-NEXT: s_lshl_b32 s8, s79, 8 ; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff @@ -53156,16 +53772,16 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_and_b32 s4, s23, 0xff +; VI-NEXT: s_and_b32 s4, s13, 0xff ; VI-NEXT: s_lshl_b32 s6, s78, 8 ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s77, 0xff -; VI-NEXT: s_lshl_b32 s7, s76, 8 +; VI-NEXT: s_lshl_b32 s8, s76, 8 ; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v21 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff @@ -53175,16 +53791,16 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_and_b32 s4, s25, 0xff +; VI-NEXT: s_and_b32 s4, s11, 0xff ; VI-NEXT: s_lshl_b32 s6, s75, 8 ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s74, 0xff -; VI-NEXT: s_lshl_b32 s7, s73, 8 +; VI-NEXT: s_lshl_b32 s8, s73, 8 ; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v33 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v20 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v32, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff @@ -53194,16 +53810,16 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_and_b32 s4, s27, 0xff +; VI-NEXT: s_and_b32 s4, s9, 0xff ; VI-NEXT: s_lshl_b32 s6, s72, 8 ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s63, 0xff -; VI-NEXT: s_lshl_b32 s7, s62, 8 +; VI-NEXT: s_lshl_b32 s8, s62, 8 ; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v19 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff @@ -53214,7 +53830,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_and_b32 s4, s29, 0xff +; VI-NEXT: s_and_b32 s4, s7, 0xff ; VI-NEXT: s_lshl_b32 s6, s61, 8 ; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 @@ -53299,10 +53915,38 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: v_writelane_b32 v40, s51, 11 ; GFX9-NEXT: v_writelane_b32 v40, s52, 12 ; GFX9-NEXT: v_writelane_b32 v40, s53, 13 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_mov_b32_e32 v8, s20 +; GFX9-NEXT: v_mov_b32_e32 v9, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s22 +; GFX9-NEXT: v_mov_b32_e32 v11, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v14, s26 +; GFX9-NEXT: v_mov_b32_e32 v15, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX9-NEXT: v_writelane_b32 v40, s54, 14 +; GFX9-NEXT: v_readfirstlane_b32 s18, v4 +; GFX9-NEXT: v_readfirstlane_b32 s19, v5 +; GFX9-NEXT: v_readfirstlane_b32 s16, v6 +; GFX9-NEXT: v_readfirstlane_b32 s17, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s12, v10 +; GFX9-NEXT: v_readfirstlane_b32 s13, v11 +; GFX9-NEXT: v_readfirstlane_b32 s10, v12 +; GFX9-NEXT: v_readfirstlane_b32 s11, v13 +; GFX9-NEXT: v_readfirstlane_b32 s8, v14 +; GFX9-NEXT: v_readfirstlane_b32 s9, v15 +; GFX9-NEXT: v_readfirstlane_b32 s6, v16 +; GFX9-NEXT: v_readfirstlane_b32 s7, v17 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: v_writelane_b32 v40, s55, 15 ; GFX9-NEXT: s_cbranch_scc0 .LBB85_3 @@ -53312,59 +53956,59 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_lshr_b32 s58, s5, 8 ; GFX9-NEXT: s_lshr_b32 s31, s4, 16 ; GFX9-NEXT: s_lshr_b32 s30, s4, 8 -; GFX9-NEXT: s_lshr_b32 s59, s29, 24 -; GFX9-NEXT: s_lshr_b32 s60, s29, 16 -; GFX9-NEXT: s_lshr_b32 s61, s29, 8 -; GFX9-NEXT: s_lshr_b32 s35, s28, 16 -; GFX9-NEXT: s_lshr_b32 s34, s28, 8 -; GFX9-NEXT: s_lshr_b32 s62, s27, 24 -; GFX9-NEXT: s_lshr_b32 s63, s27, 16 -; GFX9-NEXT: s_lshr_b32 s72, s27, 8 -; GFX9-NEXT: s_lshr_b32 s37, s26, 16 -; GFX9-NEXT: s_lshr_b32 s36, s26, 8 -; GFX9-NEXT: s_lshr_b32 s73, s25, 24 -; GFX9-NEXT: s_lshr_b32 s74, s25, 16 -; GFX9-NEXT: s_lshr_b32 s75, s25, 8 -; GFX9-NEXT: s_lshr_b32 s39, s24, 16 -; GFX9-NEXT: s_lshr_b32 s38, s24, 8 -; GFX9-NEXT: s_lshr_b32 s76, s23, 24 -; GFX9-NEXT: s_lshr_b32 s77, s23, 16 -; GFX9-NEXT: s_lshr_b32 s78, s23, 8 -; GFX9-NEXT: s_lshr_b32 s49, s22, 16 -; GFX9-NEXT: s_lshr_b32 s48, s22, 8 -; GFX9-NEXT: s_lshr_b32 s79, s21, 24 -; GFX9-NEXT: s_lshr_b32 s88, s21, 16 -; GFX9-NEXT: s_lshr_b32 s89, s21, 8 -; GFX9-NEXT: s_lshr_b32 s51, s20, 16 -; GFX9-NEXT: s_lshr_b32 s50, s20, 8 -; GFX9-NEXT: s_lshr_b32 s90, s19, 24 -; GFX9-NEXT: s_lshr_b32 s91, s19, 16 -; GFX9-NEXT: s_lshr_b32 s92, s19, 8 -; GFX9-NEXT: s_lshr_b32 s53, s18, 16 -; GFX9-NEXT: s_lshr_b32 s52, s18, 8 -; GFX9-NEXT: s_lshr_b32 s93, s17, 24 -; GFX9-NEXT: s_lshr_b32 s94, s17, 16 -; GFX9-NEXT: s_lshr_b32 s95, s17, 8 -; GFX9-NEXT: s_lshr_b32 s55, s16, 16 -; GFX9-NEXT: s_lshr_b32 s54, s16, 8 +; GFX9-NEXT: s_lshr_b32 s59, s7, 24 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s7, 8 +; GFX9-NEXT: s_lshr_b32 s35, s6, 16 +; GFX9-NEXT: s_lshr_b32 s34, s6, 8 +; GFX9-NEXT: s_lshr_b32 s62, s9, 24 +; GFX9-NEXT: s_lshr_b32 s63, s9, 16 +; GFX9-NEXT: s_lshr_b32 s72, s9, 8 +; GFX9-NEXT: s_lshr_b32 s37, s8, 16 +; GFX9-NEXT: s_lshr_b32 s36, s8, 8 +; GFX9-NEXT: s_lshr_b32 s73, s11, 24 +; GFX9-NEXT: s_lshr_b32 s74, s11, 16 +; GFX9-NEXT: s_lshr_b32 s75, s11, 8 +; GFX9-NEXT: s_lshr_b32 s39, s10, 16 +; GFX9-NEXT: s_lshr_b32 s38, s10, 8 +; GFX9-NEXT: s_lshr_b32 s76, s13, 24 +; GFX9-NEXT: s_lshr_b32 s77, s13, 16 +; GFX9-NEXT: s_lshr_b32 s78, s13, 8 +; GFX9-NEXT: s_lshr_b32 s49, s12, 16 +; GFX9-NEXT: s_lshr_b32 s48, s12, 8 +; GFX9-NEXT: s_lshr_b32 s79, s15, 24 +; GFX9-NEXT: s_lshr_b32 s88, s15, 16 +; GFX9-NEXT: s_lshr_b32 s89, s15, 8 +; GFX9-NEXT: s_lshr_b32 s51, s14, 16 +; GFX9-NEXT: s_lshr_b32 s50, s14, 8 +; GFX9-NEXT: s_lshr_b32 s90, s17, 24 +; GFX9-NEXT: s_lshr_b32 s91, s17, 16 +; GFX9-NEXT: s_lshr_b32 s92, s17, 8 +; GFX9-NEXT: s_lshr_b32 s53, s16, 16 +; GFX9-NEXT: s_lshr_b32 s52, s16, 8 +; GFX9-NEXT: s_lshr_b32 s93, s19, 24 +; GFX9-NEXT: s_lshr_b32 s94, s19, 16 +; GFX9-NEXT: s_lshr_b32 s95, s19, 8 +; GFX9-NEXT: s_lshr_b32 s55, s18, 16 +; GFX9-NEXT: s_lshr_b32 s54, s18, 8 ; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB85_4 ; GFX9-NEXT: .LBB85_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[3:4], s[28:29], 1.0 -; GFX9-NEXT: v_add_f64 v[5:6], s[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[9:10], s[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[15:16], s[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[11:12], s[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[9:10], s[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], s[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], s[18:19], 1.0 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] ; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] @@ -53372,13 +54016,13 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] ; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[15:16] ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12] -; GFX9-NEXT: v_readfirstlane_b32 s17, v14 -; GFX9-NEXT: v_readfirstlane_b32 s19, v12 -; GFX9-NEXT: v_readfirstlane_b32 s21, v16 -; GFX9-NEXT: v_readfirstlane_b32 s23, v10 -; GFX9-NEXT: v_readfirstlane_b32 s25, v8 -; GFX9-NEXT: v_readfirstlane_b32 s27, v6 -; GFX9-NEXT: v_readfirstlane_b32 s29, v4 +; GFX9-NEXT: v_readfirstlane_b32 s19, v14 +; GFX9-NEXT: v_readfirstlane_b32 s17, v12 +; GFX9-NEXT: v_readfirstlane_b32 s15, v16 +; GFX9-NEXT: v_readfirstlane_b32 s13, v10 +; GFX9-NEXT: v_readfirstlane_b32 s11, v8 +; GFX9-NEXT: v_readfirstlane_b32 s9, v6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v4 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] ; GFX9-NEXT: s_lshr_b32 s56, s5, 24 @@ -53386,70 +54030,70 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_lshr_b32 s58, s5, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v1 -; GFX9-NEXT: s_lshr_b32 s59, s29, 24 -; GFX9-NEXT: s_lshr_b32 s60, s29, 16 -; GFX9-NEXT: s_lshr_b32 s61, s29, 8 +; GFX9-NEXT: s_lshr_b32 s59, s7, 24 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s7, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; GFX9-NEXT: s_lshr_b32 s62, s27, 24 -; GFX9-NEXT: s_lshr_b32 s63, s27, 16 -; GFX9-NEXT: s_lshr_b32 s72, s27, 8 +; GFX9-NEXT: s_lshr_b32 s62, s9, 24 +; GFX9-NEXT: s_lshr_b32 s63, s9, 16 +; GFX9-NEXT: s_lshr_b32 s72, s9, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; GFX9-NEXT: s_lshr_b32 s73, s25, 24 -; GFX9-NEXT: s_lshr_b32 s74, s25, 16 -; GFX9-NEXT: s_lshr_b32 s75, s25, 8 +; GFX9-NEXT: s_lshr_b32 s73, s11, 24 +; GFX9-NEXT: s_lshr_b32 s74, s11, 16 +; GFX9-NEXT: s_lshr_b32 s75, s11, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 -; GFX9-NEXT: s_lshr_b32 s76, s23, 24 -; GFX9-NEXT: s_lshr_b32 s77, s23, 16 -; GFX9-NEXT: s_lshr_b32 s78, s23, 8 +; GFX9-NEXT: s_lshr_b32 s76, s13, 24 +; GFX9-NEXT: s_lshr_b32 s77, s13, 16 +; GFX9-NEXT: s_lshr_b32 s78, s13, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v9 -; GFX9-NEXT: s_lshr_b32 s79, s21, 24 -; GFX9-NEXT: s_lshr_b32 s88, s21, 16 -; GFX9-NEXT: s_lshr_b32 s89, s21, 8 +; GFX9-NEXT: s_lshr_b32 s79, s15, 24 +; GFX9-NEXT: s_lshr_b32 s88, s15, 16 +; GFX9-NEXT: s_lshr_b32 s89, s15, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v15 -; GFX9-NEXT: s_lshr_b32 s90, s19, 24 -; GFX9-NEXT: s_lshr_b32 s91, s19, 16 -; GFX9-NEXT: s_lshr_b32 s92, s19, 8 +; GFX9-NEXT: s_lshr_b32 s90, s17, 24 +; GFX9-NEXT: s_lshr_b32 s91, s17, 16 +; GFX9-NEXT: s_lshr_b32 s92, s17, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 -; GFX9-NEXT: s_lshr_b32 s93, s17, 24 -; GFX9-NEXT: s_lshr_b32 s94, s17, 16 -; GFX9-NEXT: s_lshr_b32 s95, s17, 8 +; GFX9-NEXT: s_lshr_b32 s93, s19, 24 +; GFX9-NEXT: s_lshr_b32 s94, s19, 16 +; GFX9-NEXT: s_lshr_b32 s95, s19, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v13 ; GFX9-NEXT: s_branch .LBB85_5 ; GFX9-NEXT: .LBB85_3: ; GFX9-NEXT: ; implicit-def: $sgpr54 ; GFX9-NEXT: ; implicit-def: $sgpr55 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr20 ; GFX9-NEXT: ; implicit-def: $sgpr95 ; GFX9-NEXT: ; implicit-def: $sgpr94 ; GFX9-NEXT: ; implicit-def: $sgpr93 ; GFX9-NEXT: ; implicit-def: $sgpr52 ; GFX9-NEXT: ; implicit-def: $sgpr53 -; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr22 ; GFX9-NEXT: ; implicit-def: $sgpr92 ; GFX9-NEXT: ; implicit-def: $sgpr91 ; GFX9-NEXT: ; implicit-def: $sgpr90 ; GFX9-NEXT: ; implicit-def: $sgpr50 ; GFX9-NEXT: ; implicit-def: $sgpr51 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: ; implicit-def: $sgpr89 ; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr79 ; GFX9-NEXT: ; implicit-def: $sgpr48 ; GFX9-NEXT: ; implicit-def: $sgpr49 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr78 ; GFX9-NEXT: ; implicit-def: $sgpr77 ; GFX9-NEXT: ; implicit-def: $sgpr76 ; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr39 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr75 ; GFX9-NEXT: ; implicit-def: $sgpr74 ; GFX9-NEXT: ; implicit-def: $sgpr73 @@ -53473,8 +54117,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: s_branch .LBB85_2 ; GFX9-NEXT: .LBB85_4: -; GFX9-NEXT: v_mov_b32_e32 v13, s16 -; GFX9-NEXT: v_mov_b32_e32 v11, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s18 +; GFX9-NEXT: v_mov_b32_e32 v11, s16 ; GFX9-NEXT: v_mov_b32_e32 v39, s55 ; GFX9-NEXT: v_mov_b32_e32 v49, s54 ; GFX9-NEXT: v_mov_b32_e32 v37, s53 @@ -53491,29 +54135,29 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v29, s34 ; GFX9-NEXT: v_mov_b32_e32 v26, s31 ; GFX9-NEXT: v_mov_b32_e32 v27, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s20 -; GFX9-NEXT: v_mov_b32_e32 v9, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s24 -; GFX9-NEXT: v_mov_b32_e32 v5, s26 -; GFX9-NEXT: v_mov_b32_e32 v3, s28 +; GFX9-NEXT: v_mov_b32_e32 v15, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v24, s6 -; GFX9-NEXT: v_mov_b32_e32 v23, s8 -; GFX9-NEXT: v_mov_b32_e32 v22, s10 -; GFX9-NEXT: v_mov_b32_e32 v21, s12 -; GFX9-NEXT: v_mov_b32_e32 v20, s14 +; GFX9-NEXT: v_mov_b32_e32 v24, s20 +; GFX9-NEXT: v_mov_b32_e32 v23, s22 +; GFX9-NEXT: v_mov_b32_e32 v22, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v20, s28 ; GFX9-NEXT: v_mov_b32_e32 v19, s40 ; GFX9-NEXT: v_mov_b32_e32 v18, s42 ; GFX9-NEXT: v_mov_b32_e32 v17, s44 ; GFX9-NEXT: .LBB85_5: ; %end -; GFX9-NEXT: s_and_b32 s4, s17, 0xff +; GFX9-NEXT: s_and_b32 s4, s19, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s95, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s94, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s93, 8 +; GFX9-NEXT: s_lshl_b32 s8, s93, 8 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v49 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v24 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s6, s6, s8 ; GFX9-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v39, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff @@ -53522,15 +54166,15 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_and_b32 s4, s19, 0xff +; GFX9-NEXT: s_and_b32 s4, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s92, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s91, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s90, 8 +; GFX9-NEXT: s_lshl_b32 s8, s90, 8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v48 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v23 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s6, s6, s8 ; GFX9-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff @@ -53539,15 +54183,15 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_and_b32 s4, s21, 0xff +; GFX9-NEXT: s_and_b32 s4, s15, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s89, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s88, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s79, 8 +; GFX9-NEXT: s_lshl_b32 s8, s79, 8 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v38 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v22 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s6, s6, s8 ; GFX9-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff @@ -53556,15 +54200,15 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_and_b32 s4, s23, 0xff +; GFX9-NEXT: s_and_b32 s4, s13, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s78, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s77, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s76, 8 +; GFX9-NEXT: s_lshl_b32 s8, s76, 8 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v35 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v21 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s6, s6, s8 ; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff @@ -53573,15 +54217,15 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_and_b32 s4, s25, 0xff +; GFX9-NEXT: s_and_b32 s4, s11, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s75, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s74, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s73, 8 +; GFX9-NEXT: s_lshl_b32 s8, s73, 8 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v33 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v20 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s6, s6, s8 ; GFX9-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v32, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff @@ -53590,15 +54234,15 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:32 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_and_b32 s4, s27, 0xff +; GFX9-NEXT: s_and_b32 s4, s9, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s72, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s63, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s62, 8 +; GFX9-NEXT: s_lshl_b32 s8, s62, 8 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v31 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v19 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s6, s6, s8 ; GFX9-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff @@ -53607,7 +54251,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:40 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_and_b32 s4, s29, 0xff +; GFX9-NEXT: s_and_b32 s4, s7, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s61, 8 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v29 @@ -58875,111 +59519,139 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i ; VI-LABEL: bitcast_v32i16_to_v32f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s25 +; VI-NEXT: v_mov_b32_e32 v13, s26 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s21, v3 +; VI-NEXT: v_readfirstlane_b32 s20, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s18, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s16, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s14, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s12, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s10, v14 +; VI-NEXT: v_readfirstlane_b32 s8, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v16 ; VI-NEXT: v_readfirstlane_b32 s6, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v1 ; VI-NEXT: s_cbranch_scc0 .LBB89_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB89_3 ; VI-NEXT: .LBB89_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s17, 3 -; VI-NEXT: s_and_b32 s10, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s18, 3 -; VI-NEXT: s_and_b32 s12, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s19, 3 -; VI-NEXT: s_and_b32 s14, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s20, 3 -; VI-NEXT: s_and_b32 s16, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s21, 3 -; VI-NEXT: s_and_b32 s18, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s22, 3 -; VI-NEXT: s_and_b32 s20, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s23, 3 -; VI-NEXT: s_and_b32 s22, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s24, 3 -; VI-NEXT: s_and_b32 s24, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_and_b32 s40, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_and_b32 s41, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s42, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s43, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s21, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s22, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s23, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s24, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s25, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s26, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_and_b32 s27, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_and_b32 s28, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_and_b32 s29, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_and_b32 s40, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_and_b32 s41, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_and_b32 s42, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_and_b32 s43, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 ; VI-NEXT: s_and_b32 s44, s6, 0xffff0000 ; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_and_b32 s45, s7, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s45, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s26, s26, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s10, s10, 0xffff ; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s20, s20, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s7, s45, s7 +; VI-NEXT: s_or_b32 s9, s45, s9 ; VI-NEXT: s_or_b32 s6, s44, s6 -; VI-NEXT: s_or_b32 s29, s43, s29 -; VI-NEXT: s_or_b32 s28, s42, s28 -; VI-NEXT: s_or_b32 s27, s41, s27 -; VI-NEXT: s_or_b32 s26, s40, s26 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s7, s43, s7 +; VI-NEXT: s_or_b32 s8, s42, s8 +; VI-NEXT: s_or_b32 s10, s41, s10 +; VI-NEXT: s_or_b32 s11, s40, s11 +; VI-NEXT: s_or_b32 s12, s29, s12 +; VI-NEXT: s_or_b32 s13, s28, s13 +; VI-NEXT: s_or_b32 s14, s27, s14 +; VI-NEXT: s_or_b32 s15, s26, s15 +; VI-NEXT: s_or_b32 s16, s25, s16 +; VI-NEXT: s_or_b32 s17, s24, s17 +; VI-NEXT: s_or_b32 s18, s23, s18 +; VI-NEXT: s_or_b32 s19, s22, s19 +; VI-NEXT: s_or_b32 s20, s21, s20 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s27, 0x30000 -; VI-NEXT: s_add_i32 s26, s26, 0x30000 -; VI-NEXT: s_add_i32 s25, s24, 0x30000 -; VI-NEXT: s_add_i32 s24, s22, 0x30000 -; VI-NEXT: s_add_i32 s23, s20, 0x30000 -; VI-NEXT: s_add_i32 s22, s18, 0x30000 -; VI-NEXT: s_add_i32 s21, s16, 0x30000 -; VI-NEXT: s_add_i32 s20, s14, 0x30000 -; VI-NEXT: s_add_i32 s19, s12, 0x30000 -; VI-NEXT: s_add_i32 s18, s10, 0x30000 -; VI-NEXT: s_add_i32 s17, s8, 0x30000 -; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 ; VI-NEXT: .LBB89_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s21 +; VI-NEXT: v_mov_b32_e32 v1, s20 +; VI-NEXT: v_mov_b32_e32 v2, s19 +; VI-NEXT: v_mov_b32_e32 v3, s18 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s16 +; VI-NEXT: v_mov_b32_e32 v6, s15 +; VI-NEXT: v_mov_b32_e32 v7, s14 +; VI-NEXT: v_mov_b32_e32 v8, s13 +; VI-NEXT: v_mov_b32_e32 v9, s12 +; VI-NEXT: v_mov_b32_e32 v10, s11 +; VI-NEXT: v_mov_b32_e32 v11, s10 +; VI-NEXT: v_mov_b32_e32 v12, s8 +; VI-NEXT: v_mov_b32_e32 v13, s7 ; VI-NEXT: v_mov_b32_e32 v14, s6 -; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: v_mov_b32_e32 v15, s9 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB89_4: ; VI-NEXT: s_branch .LBB89_2 @@ -60336,111 +61008,139 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; VI-LABEL: bitcast_v32i16_to_v32bf16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s25 +; VI-NEXT: v_mov_b32_e32 v13, s26 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s21, v3 +; VI-NEXT: v_readfirstlane_b32 s20, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s18, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s16, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s14, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s12, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s10, v14 +; VI-NEXT: v_readfirstlane_b32 s8, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v16 ; VI-NEXT: v_readfirstlane_b32 s6, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v1 ; VI-NEXT: s_cbranch_scc0 .LBB93_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB93_3 ; VI-NEXT: .LBB93_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s17, 3 -; VI-NEXT: s_and_b32 s10, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s18, 3 -; VI-NEXT: s_and_b32 s12, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s19, 3 -; VI-NEXT: s_and_b32 s14, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s20, 3 -; VI-NEXT: s_and_b32 s16, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s21, 3 -; VI-NEXT: s_and_b32 s18, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s22, 3 -; VI-NEXT: s_and_b32 s20, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s23, 3 -; VI-NEXT: s_and_b32 s22, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s24, 3 -; VI-NEXT: s_and_b32 s24, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_and_b32 s40, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_and_b32 s41, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s42, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s43, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s21, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s22, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s23, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s24, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s25, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s26, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_and_b32 s27, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_and_b32 s28, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_and_b32 s29, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_and_b32 s40, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_and_b32 s41, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_and_b32 s42, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_and_b32 s43, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 ; VI-NEXT: s_and_b32 s44, s6, 0xffff0000 ; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_and_b32 s45, s7, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s45, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s26, s26, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s10, s10, 0xffff ; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s20, s20, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s7, s45, s7 +; VI-NEXT: s_or_b32 s9, s45, s9 ; VI-NEXT: s_or_b32 s6, s44, s6 -; VI-NEXT: s_or_b32 s29, s43, s29 -; VI-NEXT: s_or_b32 s28, s42, s28 -; VI-NEXT: s_or_b32 s27, s41, s27 -; VI-NEXT: s_or_b32 s26, s40, s26 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s7, s43, s7 +; VI-NEXT: s_or_b32 s8, s42, s8 +; VI-NEXT: s_or_b32 s10, s41, s10 +; VI-NEXT: s_or_b32 s11, s40, s11 +; VI-NEXT: s_or_b32 s12, s29, s12 +; VI-NEXT: s_or_b32 s13, s28, s13 +; VI-NEXT: s_or_b32 s14, s27, s14 +; VI-NEXT: s_or_b32 s15, s26, s15 +; VI-NEXT: s_or_b32 s16, s25, s16 +; VI-NEXT: s_or_b32 s17, s24, s17 +; VI-NEXT: s_or_b32 s18, s23, s18 +; VI-NEXT: s_or_b32 s19, s22, s19 +; VI-NEXT: s_or_b32 s20, s21, s20 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s27, 0x30000 -; VI-NEXT: s_add_i32 s26, s26, 0x30000 -; VI-NEXT: s_add_i32 s25, s24, 0x30000 -; VI-NEXT: s_add_i32 s24, s22, 0x30000 -; VI-NEXT: s_add_i32 s23, s20, 0x30000 -; VI-NEXT: s_add_i32 s22, s18, 0x30000 -; VI-NEXT: s_add_i32 s21, s16, 0x30000 -; VI-NEXT: s_add_i32 s20, s14, 0x30000 -; VI-NEXT: s_add_i32 s19, s12, 0x30000 -; VI-NEXT: s_add_i32 s18, s10, 0x30000 -; VI-NEXT: s_add_i32 s17, s8, 0x30000 -; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 ; VI-NEXT: .LBB93_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s21 +; VI-NEXT: v_mov_b32_e32 v1, s20 +; VI-NEXT: v_mov_b32_e32 v2, s19 +; VI-NEXT: v_mov_b32_e32 v3, s18 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s16 +; VI-NEXT: v_mov_b32_e32 v6, s15 +; VI-NEXT: v_mov_b32_e32 v7, s14 +; VI-NEXT: v_mov_b32_e32 v8, s13 +; VI-NEXT: v_mov_b32_e32 v9, s12 +; VI-NEXT: v_mov_b32_e32 v10, s11 +; VI-NEXT: v_mov_b32_e32 v11, s10 +; VI-NEXT: v_mov_b32_e32 v12, s8 +; VI-NEXT: v_mov_b32_e32 v13, s7 ; VI-NEXT: v_mov_b32_e32 v14, s6 -; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: v_mov_b32_e32 v15, s9 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB93_4: ; VI-NEXT: s_branch .LBB93_2 @@ -61971,189 +62671,204 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e64 v57, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v61, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s29 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v60 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v8, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_lshr_b64 v[12:13], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_alignbit_b32 v12, v10, v2, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_alignbit_b32 v16, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_lshr_b64 v[20:21], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_alignbit_b32 v20, v14, v2, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_lshr_b64 v[24:25], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_alignbit_b32 v24, v15, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshr_b64 v[28:29], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v29 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 -; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_lshr_b64 v[38:39], v[38:39], 16 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v18, v19, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v30, v31, v15, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshr_b64 v[18:19], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[6:7], v[50:51], 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[10:11], v[48:49], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; SI-NEXT: v_lshr_b64 v[14:15], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_lshr_b64 v[41:42], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[35:36], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshr_b64 v[42:43], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[33:34], 16 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshr_b64 v[43:44], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[21:22], 16 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 -; SI-NEXT: v_lshr_b64 v[33:34], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[29:30], 16 -; SI-NEXT: v_alignbit_b32 v28, v40, v41, 16 +; SI-NEXT: v_lshr_b64 v[45:46], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[29:30], 16 ; SI-NEXT: .LBB95_3: ; %end +; SI-NEXT: v_mov_b32_e32 v5, v40 +; SI-NEXT: v_mov_b32_e32 v9, v41 +; SI-NEXT: v_mov_b32_e32 v13, v42 +; SI-NEXT: v_mov_b32_e32 v17, v43 +; SI-NEXT: v_mov_b32_e32 v21, v44 +; SI-NEXT: v_mov_b32_e32 v25, v45 +; SI-NEXT: v_mov_b32_e32 v29, v46 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -62170,695 +62885,395 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v1, v33 -; SI-NEXT: v_mov_b32_e32 v5, v34 -; SI-NEXT: v_mov_b32_e32 v9, v35 -; SI-NEXT: v_mov_b32_e32 v13, v36 -; SI-NEXT: v_mov_b32_e32 v17, v37 -; SI-NEXT: v_mov_b32_e32 v21, v38 -; SI-NEXT: v_mov_b32_e32 v25, v50 -; SI-NEXT: v_mov_b32_e32 v29, v48 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, v54 +; SI-NEXT: v_mov_b32_e32 v3, v53 +; SI-NEXT: v_mov_b32_e32 v7, v51 +; SI-NEXT: v_mov_b32_e32 v11, v49 +; SI-NEXT: v_mov_b32_e32 v15, v38 +; SI-NEXT: v_mov_b32_e32 v19, v39 +; SI-NEXT: v_mov_b32_e32 v23, v36 +; SI-NEXT: v_mov_b32_e32 v27, v34 +; SI-NEXT: v_mov_b32_e32 v31, v32 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v32bf16_to_v32i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v20, s30, 0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v20, s31, 1 -; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v21, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v19, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v18, s25 +; VI-NEXT: v_mov_b32_e32 v17, s27 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s31, v1 -; VI-NEXT: s_cbranch_scc0 .LBB95_3 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_cbranch_scc0 .LBB95_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB95_4 +; VI-NEXT: s_cbranch_execnz .LBB95_3 ; VI-NEXT: .LBB95_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_lshl_b32 s5, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v3 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s5, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s5, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s5, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s5, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v24, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_and_b32 s5, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s5, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s4, v1 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v20 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v26, v13, v19, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v18 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v19, vcc +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v18, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v13 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_bfe_u32 v13, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v28, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v28, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v28, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v13 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v13, v28, v29, vcc +; VI-NEXT: v_bfe_u32 v28, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v17 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 -; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16 -; VI-NEXT: s_branch .LBB95_5 -; VI-NEXT: .LBB95_3: -; VI-NEXT: s_branch .LBB95_2 -; VI-NEXT: .LBB95_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s30 -; VI-NEXT: v_mov_b32_e32 v15, s31 -; VI-NEXT: .LBB95_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v20, 1 -; VI-NEXT: v_readlane_b32 s30, v20, 0 -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e32 v28, v28, v29, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_bfe_u32 v17, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v12 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v30, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v30, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_bfe_u32 v30, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v17 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v30, v31, vcc +; VI-NEXT: v_bfe_u32 v30, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v16 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v30, v31, vcc +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v31, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc +; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v15 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v32, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v14 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[24:25] +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_mov_b32_e32 v21, v23 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[26:27] +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_mov_b32_e32 v19, v23 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[28:29] +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v33, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v32 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_mov_b32_e32 v17, v23 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[14:15] +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[30:31] +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[12:13] +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11] +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v15, v23 +; VI-NEXT: .LBB95_3: ; %end +; VI-NEXT: v_mov_b32_e32 v1, v22 +; VI-NEXT: v_mov_b32_e32 v3, v21 +; VI-NEXT: v_mov_b32_e32 v5, v20 +; VI-NEXT: v_mov_b32_e32 v7, v19 +; VI-NEXT: v_mov_b32_e32 v9, v18 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_mov_b32_e32 v13, v16 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB95_4: +; VI-NEXT: s_branch .LBB95_2 ; ; GFX9-LABEL: bitcast_v32bf16_to_v32i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 -; GFX9-NEXT: v_readfirstlane_b32 s30, v0 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s31, v1 -; GFX9-NEXT: s_cbranch_scc0 .LBB95_3 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB95_4 -; GFX9-NEXT: .LBB95_2: ; %cmp.true -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX9-NEXT: s_and_b32 s5, s30, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v1, s5, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s5, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s5, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s5, s31, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s5, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s5, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s5, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 -; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_and_or_b32 v14, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_and_or_b32 v15, v3, v16, v4 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v13, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v12, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v11, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v10, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v9, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v8, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v7, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v6, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v5, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v4, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc -; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v3, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v2, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_and_or_b32 v1, v1, v16, v17 -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_and_or_b32 v0, v17, v16, v0 -; GFX9-NEXT: s_branch .LBB95_5 -; GFX9-NEXT: .LBB95_3: -; GFX9-NEXT: s_branch .LBB95_2 -; GFX9-NEXT: .LBB95_4: +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 @@ -62872,17 +63287,305 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: v_mov_b32_e32 v14, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s31 -; GFX9-NEXT: .LBB95_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_cbranch_scc0 .LBB95_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB95_3 +; GFX9-NEXT: .LBB95_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v0 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v1 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc +; GFX9-NEXT: v_bfe_u32 v19, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v2 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v20, vcc +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_bfe_u32 v20, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v20, v20, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v20, 0x7fff, v20 +; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc +; GFX9-NEXT: v_bfe_u32 v20, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v20, v20, v3 +; GFX9-NEXT: v_add_u32_e32 v20, 0x7fff, v20 +; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_bfe_u32 v21, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v21, v21, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v21, 0x7fff, v21 +; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc +; GFX9-NEXT: v_bfe_u32 v21, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v21, v21, v4 +; GFX9-NEXT: v_add_u32_e32 v21, 0x7fff, v21 +; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_bfe_u32 v22, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v22, v22, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v22, 0x7fff, v22 +; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc +; GFX9-NEXT: v_bfe_u32 v22, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v22, v22, v5 +; GFX9-NEXT: v_add_u32_e32 v22, 0x7fff, v22 +; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_bfe_u32 v23, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v23, v23, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v23, 0x7fff, v23 +; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc +; GFX9-NEXT: v_bfe_u32 v23, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v23, v23, v6 +; GFX9-NEXT: v_add_u32_e32 v23, 0x7fff, v23 +; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v24, vcc +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_bfe_u32 v24, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v24, v24, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v24, 0x7fff, v24 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc +; GFX9-NEXT: v_bfe_u32 v24, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v24, v24, v7 +; GFX9-NEXT: v_add_u32_e32 v24, 0x7fff, v24 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v25, v25, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v25, 0x7fff, v25 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc +; GFX9-NEXT: v_bfe_u32 v25, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v25, v25, v8 +; GFX9-NEXT: v_add_u32_e32 v25, 0x7fff, v25 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v26, vcc +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v26, v26, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v26, 0x7fff, v26 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc +; GFX9-NEXT: v_bfe_u32 v26, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v26, v26, v9 +; GFX9-NEXT: v_add_u32_e32 v26, 0x7fff, v26 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v27, vcc +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_bfe_u32 v27, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v27, v27, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v27, 0x7fff, v27 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc +; GFX9-NEXT: v_bfe_u32 v27, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v27, v27, v10 +; GFX9-NEXT: v_add_u32_e32 v27, 0x7fff, v27 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_bfe_u32 v28, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v28, v28, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v28, 0x7fff, v28 +; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc +; GFX9-NEXT: v_bfe_u32 v28, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v28, v28, v11 +; GFX9-NEXT: v_add_u32_e32 v28, 0x7fff, v28 +; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_bfe_u32 v29, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v29, v29, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v29, 0x7fff, v29 +; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc +; GFX9-NEXT: v_bfe_u32 v29, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v29, v29, v12 +; GFX9-NEXT: v_add_u32_e32 v29, 0x7fff, v29 +; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v30, vcc +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_bfe_u32 v30, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v30, v30, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v30, 0x7fff, v30 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v30, v31, vcc +; GFX9-NEXT: v_bfe_u32 v30, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v30, v30, v13 +; GFX9-NEXT: v_add_u32_e32 v30, 0x7fff, v30 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_bfe_u32 v31, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v31, v31, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v31, 0x7fff, v31 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc +; GFX9-NEXT: v_bfe_u32 v31, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v31, v31, v14 +; GFX9-NEXT: v_add_u32_e32 v31, 0x7fff, v31 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v32, v32, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_add_u32_e32 v32, 0x7fff, v32 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX9-NEXT: v_bfe_u32 v32, v15, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v32, v32, v15 +; GFX9-NEXT: v_add_u32_e32 v32, 0x7fff, v32 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_mov_b32_e32 v32, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v15, v31, v32, v15 +; GFX9-NEXT: v_and_or_b32 v14, v30, v32, v14 +; GFX9-NEXT: v_and_or_b32 v13, v29, v32, v13 +; GFX9-NEXT: v_and_or_b32 v12, v28, v32, v12 +; GFX9-NEXT: v_and_or_b32 v11, v27, v32, v11 +; GFX9-NEXT: v_and_or_b32 v10, v26, v32, v10 +; GFX9-NEXT: v_and_or_b32 v9, v25, v32, v9 +; GFX9-NEXT: v_and_or_b32 v8, v24, v32, v8 +; GFX9-NEXT: v_and_or_b32 v7, v23, v32, v7 +; GFX9-NEXT: v_and_or_b32 v6, v22, v32, v6 +; GFX9-NEXT: v_and_or_b32 v5, v21, v32, v5 +; GFX9-NEXT: v_and_or_b32 v4, v20, v32, v4 +; GFX9-NEXT: v_and_or_b32 v3, v19, v32, v3 +; GFX9-NEXT: v_and_or_b32 v2, v18, v32, v2 +; GFX9-NEXT: v_and_or_b32 v1, v17, v32, v1 +; GFX9-NEXT: v_and_or_b32 v0, v16, v32, v0 +; GFX9-NEXT: .LBB95_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB95_4: +; GFX9-NEXT: s_branch .LBB95_2 ; ; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v32i16_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -64301,6 +65004,18 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -64317,18 +65032,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -64780,6 +65483,8 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -64796,8 +65501,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -65069,25 +65772,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -65114,6 +65801,22 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -66382,32 +67085,60 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v4, s30, 0 -; VI-NEXT: v_writelane_b32 v4, s31, 1 -; VI-NEXT: v_writelane_b32 v4, s34, 2 -; VI-NEXT: v_writelane_b32 v4, s35, 3 -; VI-NEXT: v_writelane_b32 v4, s36, 4 -; VI-NEXT: v_writelane_b32 v4, s37, 5 -; VI-NEXT: v_writelane_b32 v4, s38, 6 -; VI-NEXT: v_writelane_b32 v4, s39, 7 -; VI-NEXT: v_writelane_b32 v4, s48, 8 -; VI-NEXT: v_writelane_b32 v4, s49, 9 -; VI-NEXT: v_writelane_b32 v4, s50, 10 -; VI-NEXT: v_writelane_b32 v4, s51, 11 -; VI-NEXT: v_writelane_b32 v4, s52, 12 -; VI-NEXT: v_writelane_b32 v4, s53, 13 -; VI-NEXT: v_writelane_b32 v4, s54, 14 -; VI-NEXT: v_writelane_b32 v4, s55, 15 -; VI-NEXT: v_writelane_b32 v4, s64, 16 -; VI-NEXT: v_writelane_b32 v4, s65, 17 +; VI-NEXT: v_writelane_b32 v18, s30, 0 +; VI-NEXT: v_writelane_b32 v18, s31, 1 +; VI-NEXT: v_writelane_b32 v18, s34, 2 +; VI-NEXT: v_writelane_b32 v18, s35, 3 +; VI-NEXT: v_writelane_b32 v18, s36, 4 +; VI-NEXT: v_writelane_b32 v18, s37, 5 +; VI-NEXT: v_writelane_b32 v18, s38, 6 +; VI-NEXT: v_writelane_b32 v18, s39, 7 +; VI-NEXT: v_writelane_b32 v18, s48, 8 +; VI-NEXT: v_writelane_b32 v18, s49, 9 +; VI-NEXT: v_writelane_b32 v18, s50, 10 +; VI-NEXT: v_writelane_b32 v18, s51, 11 +; VI-NEXT: v_writelane_b32 v18, s52, 12 +; VI-NEXT: v_writelane_b32 v18, s53, 13 +; VI-NEXT: v_writelane_b32 v18, s54, 14 +; VI-NEXT: v_writelane_b32 v18, s55, 15 +; VI-NEXT: v_writelane_b32 v18, s64, 16 +; VI-NEXT: v_writelane_b32 v18, s65, 17 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v17, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: v_writelane_b32 v4, s66, 18 +; VI-NEXT: v_writelane_b32 v18, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s18, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s16, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s14, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s12, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s10, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s8, v14 +; VI-NEXT: v_readfirstlane_b32 s9, v15 +; VI-NEXT: v_readfirstlane_b32 s6, v16 +; VI-NEXT: v_readfirstlane_b32 s7, v17 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_and_b64 s[20:21], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: v_writelane_b32 v4, s67, 19 +; VI-NEXT: v_writelane_b32 v18, s67, 19 ; VI-NEXT: s_cbranch_scc0 .LBB97_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b32 s56, s5, 24 @@ -66415,351 +67146,351 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; VI-NEXT: s_lshr_b32 s58, s5, 8 ; VI-NEXT: s_lshr_b32 s59, s4, 16 ; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s29, 8 -; VI-NEXT: s_lshr_b32 s72, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 8 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s89, s25, 8 -; VI-NEXT: s_lshr_b32 s90, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s34, s23, 8 -; VI-NEXT: s_lshr_b32 s35, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s39, s21, 8 -; VI-NEXT: s_lshr_b32 s48, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s52, s19, 8 -; VI-NEXT: s_lshr_b32 s53, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s65, s17, 8 -; VI-NEXT: s_lshr_b32 s66, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 -; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s61, s7, 24 +; VI-NEXT: s_lshr_b32 s62, s7, 16 +; VI-NEXT: s_lshr_b32 s63, s7, 8 +; VI-NEXT: s_lshr_b32 s72, s6, 16 +; VI-NEXT: s_lshr_b32 s73, s6, 8 +; VI-NEXT: s_lshr_b32 s74, s9, 24 +; VI-NEXT: s_lshr_b32 s75, s9, 16 +; VI-NEXT: s_lshr_b32 s76, s9, 8 +; VI-NEXT: s_lshr_b32 s77, s8, 16 +; VI-NEXT: s_lshr_b32 s78, s8, 8 +; VI-NEXT: s_lshr_b32 s79, s11, 24 +; VI-NEXT: s_lshr_b32 s88, s11, 16 +; VI-NEXT: s_lshr_b32 s89, s11, 8 +; VI-NEXT: s_lshr_b32 s90, s10, 16 +; VI-NEXT: s_lshr_b32 s91, s10, 8 +; VI-NEXT: s_lshr_b32 s30, s13, 24 +; VI-NEXT: s_lshr_b32 s31, s13, 16 +; VI-NEXT: s_lshr_b32 s34, s13, 8 +; VI-NEXT: s_lshr_b32 s35, s12, 16 +; VI-NEXT: s_lshr_b32 s36, s12, 8 +; VI-NEXT: s_lshr_b32 s37, s15, 24 +; VI-NEXT: s_lshr_b32 s38, s15, 16 +; VI-NEXT: s_lshr_b32 s39, s15, 8 +; VI-NEXT: s_lshr_b32 s48, s14, 16 +; VI-NEXT: s_lshr_b32 s49, s14, 8 +; VI-NEXT: s_lshr_b32 s50, s17, 24 +; VI-NEXT: s_lshr_b32 s51, s17, 16 +; VI-NEXT: s_lshr_b32 s52, s17, 8 +; VI-NEXT: s_lshr_b32 s53, s16, 16 +; VI-NEXT: s_lshr_b32 s54, s16, 8 +; VI-NEXT: s_lshr_b32 s55, s19, 24 +; VI-NEXT: s_lshr_b32 s64, s19, 16 +; VI-NEXT: s_lshr_b32 s65, s19, 8 +; VI-NEXT: s_lshr_b32 s66, s18, 16 +; VI-NEXT: s_lshr_b32 s67, s18, 8 +; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; VI-NEXT: s_cbranch_execnz .LBB97_3 ; VI-NEXT: .LBB97_2: ; %cmp.true -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s16, 3 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s19, 3 -; VI-NEXT: s_add_i32 s16, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s18, 3 -; VI-NEXT: s_add_i32 s19, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s21, 3 -; VI-NEXT: s_add_i32 s18, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s20, 3 -; VI-NEXT: s_add_i32 s21, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s23, 3 -; VI-NEXT: s_add_i32 s20, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s22, 3 -; VI-NEXT: s_add_i32 s23, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s25, 3 -; VI-NEXT: s_add_i32 s22, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s24, 3 -; VI-NEXT: s_add_i32 s25, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s27, 3 -; VI-NEXT: s_add_i32 s24, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s26, 3 -; VI-NEXT: s_add_i32 s27, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s29, 3 -; VI-NEXT: s_add_i32 s26, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s28, 3 -; VI-NEXT: s_add_i32 s29, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s20, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_or_b32 s19, s20, s19 +; VI-NEXT: s_and_b32 s20, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_or_b32 s18, s20, s18 +; VI-NEXT: s_and_b32 s20, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_or_b32 s17, s20, s17 +; VI-NEXT: s_and_b32 s20, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_or_b32 s16, s20, s16 +; VI-NEXT: s_and_b32 s20, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_or_b32 s15, s20, s15 +; VI-NEXT: s_and_b32 s20, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_or_b32 s14, s20, s14 +; VI-NEXT: s_and_b32 s20, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_or_b32 s13, s20, s13 +; VI-NEXT: s_and_b32 s20, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_or_b32 s12, s20, s12 +; VI-NEXT: s_and_b32 s20, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_or_b32 s11, s20, s11 +; VI-NEXT: s_and_b32 s20, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s10, s20, s10 +; VI-NEXT: s_and_b32 s20, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_or_b32 s9, s20, s9 +; VI-NEXT: s_and_b32 s20, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s8, s20, s8 +; VI-NEXT: s_and_b32 s20, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s28, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s5, 0xffff0000 +; VI-NEXT: s_or_b32 s7, s20, s7 +; VI-NEXT: s_and_b32 s20, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s6, s20, s6 +; VI-NEXT: s_and_b32 s20, s5, 0xffff0000 ; VI-NEXT: s_add_i32 s5, s5, 3 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s6, s4, 0xffff0000 +; VI-NEXT: s_or_b32 s5, s20, s5 +; VI-NEXT: s_and_b32 s20, s4, 0xffff0000 ; VI-NEXT: s_add_i32 s4, s4, 3 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_or_b32 s4, s6, s4 +; VI-NEXT: s_or_b32 s4, s20, s4 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 ; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; VI-NEXT: s_lshr_b32 s56, s5, 24 ; VI-NEXT: s_lshr_b32 s57, s5, 16 ; VI-NEXT: s_lshr_b32 s58, s5, 8 ; VI-NEXT: s_lshr_b32 s59, s4, 16 ; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s29, 8 -; VI-NEXT: s_lshr_b32 s72, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 8 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s89, s25, 8 -; VI-NEXT: s_lshr_b32 s90, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s34, s23, 8 -; VI-NEXT: s_lshr_b32 s35, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s39, s21, 8 -; VI-NEXT: s_lshr_b32 s48, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s52, s19, 8 -; VI-NEXT: s_lshr_b32 s53, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s65, s17, 8 -; VI-NEXT: s_lshr_b32 s66, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b32 s61, s7, 24 +; VI-NEXT: s_lshr_b32 s62, s7, 16 +; VI-NEXT: s_lshr_b32 s63, s7, 8 +; VI-NEXT: s_lshr_b32 s72, s6, 16 +; VI-NEXT: s_lshr_b32 s73, s6, 8 +; VI-NEXT: s_lshr_b32 s74, s9, 24 +; VI-NEXT: s_lshr_b32 s75, s9, 16 +; VI-NEXT: s_lshr_b32 s76, s9, 8 +; VI-NEXT: s_lshr_b32 s77, s8, 16 +; VI-NEXT: s_lshr_b32 s78, s8, 8 +; VI-NEXT: s_lshr_b32 s79, s11, 24 +; VI-NEXT: s_lshr_b32 s88, s11, 16 +; VI-NEXT: s_lshr_b32 s89, s11, 8 +; VI-NEXT: s_lshr_b32 s90, s10, 16 +; VI-NEXT: s_lshr_b32 s91, s10, 8 +; VI-NEXT: s_lshr_b32 s30, s13, 24 +; VI-NEXT: s_lshr_b32 s31, s13, 16 +; VI-NEXT: s_lshr_b32 s34, s13, 8 +; VI-NEXT: s_lshr_b32 s35, s12, 16 +; VI-NEXT: s_lshr_b32 s36, s12, 8 +; VI-NEXT: s_lshr_b32 s37, s15, 24 +; VI-NEXT: s_lshr_b32 s38, s15, 16 +; VI-NEXT: s_lshr_b32 s39, s15, 8 +; VI-NEXT: s_lshr_b32 s48, s14, 16 +; VI-NEXT: s_lshr_b32 s49, s14, 8 +; VI-NEXT: s_lshr_b32 s50, s17, 24 +; VI-NEXT: s_lshr_b32 s51, s17, 16 +; VI-NEXT: s_lshr_b32 s52, s17, 8 +; VI-NEXT: s_lshr_b32 s53, s16, 16 +; VI-NEXT: s_lshr_b32 s54, s16, 8 +; VI-NEXT: s_lshr_b32 s55, s19, 24 +; VI-NEXT: s_lshr_b32 s64, s19, 16 +; VI-NEXT: s_lshr_b32 s65, s19, 8 +; VI-NEXT: s_lshr_b32 s66, s18, 16 +; VI-NEXT: s_lshr_b32 s67, s18, 8 ; VI-NEXT: .LBB97_3: ; %end -; VI-NEXT: s_and_b32 s7, s16, 0xff -; VI-NEXT: s_lshl_b32 s9, s67, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s66, 0xff -; VI-NEXT: s_lshl_b32 s11, s44, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_and_b32 s7, s17, 0xff -; VI-NEXT: s_lshl_b32 s9, s65, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s64, 0xff -; VI-NEXT: s_lshl_b32 s11, s55, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s54, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s53, 0xff -; VI-NEXT: s_lshl_b32 s11, s42, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_lshl_b32 s21, s67, 8 +; VI-NEXT: s_or_b32 s18, s18, s21 +; VI-NEXT: s_and_b32 s21, s66, 0xff +; VI-NEXT: s_lshl_b32 s23, s44, 8 +; VI-NEXT: s_or_b32 s21, s21, s23 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s21, s21, 16 +; VI-NEXT: s_or_b32 s18, s18, s21 +; VI-NEXT: v_mov_b32_e32 v1, s18 +; VI-NEXT: s_and_b32 s18, s19, 0xff +; VI-NEXT: s_lshl_b32 s19, s65, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, s64, 0xff +; VI-NEXT: s_lshl_b32 s21, s55, 8 +; VI-NEXT: s_or_b32 s19, s19, s21 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s19, s19, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_lshl_b32 s18, s54, 8 +; VI-NEXT: s_or_b32 s16, s16, s18 +; VI-NEXT: s_and_b32 s18, s53, 0xff +; VI-NEXT: s_lshl_b32 s19, s42, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s18, s18, 16 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s16, s16, s18 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s19, 0xff -; VI-NEXT: s_lshl_b32 s9, s52, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s51, 0xff -; VI-NEXT: s_lshl_b32 s11, s50, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xff +; VI-NEXT: s_lshl_b32 s17, s52, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s51, 0xff +; VI-NEXT: s_lshl_b32 s18, s50, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s20, 0xff -; VI-NEXT: s_lshl_b32 s9, s49, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s48, 0xff -; VI-NEXT: s_lshl_b32 s11, s40, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s16, s49, 8 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s48, 0xff +; VI-NEXT: s_lshl_b32 s17, s40, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s14, s14, s16 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s21, 0xff -; VI-NEXT: s_lshl_b32 s9, s39, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s38, 0xff -; VI-NEXT: s_lshl_b32 s11, s37, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s14, s15, 0xff +; VI-NEXT: s_lshl_b32 s15, s39, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s15, s38, 0xff +; VI-NEXT: s_lshl_b32 s16, s37, 8 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s14, s14, s15 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s22, 0xff -; VI-NEXT: s_lshl_b32 s9, s36, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s35, 0xff -; VI-NEXT: s_lshl_b32 s11, s14, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s14, s36, 8 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: s_and_b32 s14, s35, 0xff +; VI-NEXT: s_lshl_b32 s15, s28, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s12, s12, s14 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s23, 0xff -; VI-NEXT: s_lshl_b32 s9, s34, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s31, 0xff -; VI-NEXT: s_lshl_b32 s11, s30, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s12, s13, 0xff +; VI-NEXT: s_lshl_b32 s13, s34, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s13, s31, 0xff +; VI-NEXT: s_lshl_b32 s14, s30, 8 +; VI-NEXT: s_or_b32 s13, s13, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s12, s12, s13 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_lshl_b32 s9, s91, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s90, 0xff -; VI-NEXT: s_lshl_b32 s11, s12, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s12, s91, 8 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: s_and_b32 s12, s90, 0xff +; VI-NEXT: s_lshl_b32 s13, s26, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s10, s10, s12 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s25, 0xff -; VI-NEXT: s_lshl_b32 s9, s89, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s88, 0xff -; VI-NEXT: s_lshl_b32 s11, s79, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s10, s11, 0xff +; VI-NEXT: s_lshl_b32 s11, s89, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s88, 0xff +; VI-NEXT: s_lshl_b32 s12, s79, 8 +; VI-NEXT: s_or_b32 s11, s11, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s10, s10, s11 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s9, s78, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s77, 0xff -; VI-NEXT: s_lshl_b32 s10, s10, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s10, s78, 8 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s77, 0xff +; VI-NEXT: s_lshl_b32 s11, s24, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s10 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s27, 0xff +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s8, s9, 0xff ; VI-NEXT: s_lshl_b32 s9, s76, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s9, s75, 0xff ; VI-NEXT: s_lshl_b32 s10, s74, 8 ; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s28, 0xff -; VI-NEXT: s_lshl_b32 s9, s73, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s72, 0xff -; VI-NEXT: s_lshl_b32 s8, s8, 8 -; VI-NEXT: s_or_b32 s8, s9, s8 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s8, s73, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s72, 0xff +; VI-NEXT: s_lshl_b32 s9, s22, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s29, 0xff -; VI-NEXT: s_lshl_b32 s8, s63, 8 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_and_b32 s6, s7, 0xff +; VI-NEXT: s_lshl_b32 s7, s63, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s62, 0xff +; VI-NEXT: s_lshl_b32 s8, s61, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s62, 0xff -; VI-NEXT: s_lshl_b32 s9, s61, 8 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xff -; VI-NEXT: s_lshl_b32 s7, s60, 8 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s59, 0xff -; VI-NEXT: s_lshl_b32 s6, s6, 8 -; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_lshl_b32 s6, s60, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s59, 0xff +; VI-NEXT: s_lshl_b32 s7, s20, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 @@ -66780,28 +67511,28 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_readlane_b32 s67, v4, 19 -; VI-NEXT: v_readlane_b32 s66, v4, 18 -; VI-NEXT: v_readlane_b32 s65, v4, 17 -; VI-NEXT: v_readlane_b32 s64, v4, 16 -; VI-NEXT: v_readlane_b32 s55, v4, 15 -; VI-NEXT: v_readlane_b32 s54, v4, 14 -; VI-NEXT: v_readlane_b32 s53, v4, 13 -; VI-NEXT: v_readlane_b32 s52, v4, 12 -; VI-NEXT: v_readlane_b32 s51, v4, 11 -; VI-NEXT: v_readlane_b32 s50, v4, 10 -; VI-NEXT: v_readlane_b32 s49, v4, 9 -; VI-NEXT: v_readlane_b32 s48, v4, 8 -; VI-NEXT: v_readlane_b32 s39, v4, 7 -; VI-NEXT: v_readlane_b32 s38, v4, 6 -; VI-NEXT: v_readlane_b32 s37, v4, 5 -; VI-NEXT: v_readlane_b32 s36, v4, 4 -; VI-NEXT: v_readlane_b32 s35, v4, 3 -; VI-NEXT: v_readlane_b32 s34, v4, 2 -; VI-NEXT: v_readlane_b32 s31, v4, 1 -; VI-NEXT: v_readlane_b32 s30, v4, 0 +; VI-NEXT: v_readlane_b32 s67, v18, 19 +; VI-NEXT: v_readlane_b32 s66, v18, 18 +; VI-NEXT: v_readlane_b32 s65, v18, 17 +; VI-NEXT: v_readlane_b32 s64, v18, 16 +; VI-NEXT: v_readlane_b32 s55, v18, 15 +; VI-NEXT: v_readlane_b32 s54, v18, 14 +; VI-NEXT: v_readlane_b32 s53, v18, 13 +; VI-NEXT: v_readlane_b32 s52, v18, 12 +; VI-NEXT: v_readlane_b32 s51, v18, 11 +; VI-NEXT: v_readlane_b32 s50, v18, 10 +; VI-NEXT: v_readlane_b32 s49, v18, 9 +; VI-NEXT: v_readlane_b32 s48, v18, 8 +; VI-NEXT: v_readlane_b32 s39, v18, 7 +; VI-NEXT: v_readlane_b32 s38, v18, 6 +; VI-NEXT: v_readlane_b32 s37, v18, 5 +; VI-NEXT: v_readlane_b32 s36, v18, 4 +; VI-NEXT: v_readlane_b32 s35, v18, 3 +; VI-NEXT: v_readlane_b32 s34, v18, 2 +; VI-NEXT: v_readlane_b32 s31, v18, 1 +; VI-NEXT: v_readlane_b32 s30, v18, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -66826,31 +67557,31 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr37 ; VI-NEXT: ; implicit-def: $sgpr36 ; VI-NEXT: ; implicit-def: $sgpr35 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr34 ; VI-NEXT: ; implicit-def: $sgpr31 ; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr91 ; VI-NEXT: ; implicit-def: $sgpr90 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr89 ; VI-NEXT: ; implicit-def: $sgpr88 ; VI-NEXT: ; implicit-def: $sgpr79 ; VI-NEXT: ; implicit-def: $sgpr78 ; VI-NEXT: ; implicit-def: $sgpr77 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: ; implicit-def: $sgpr74 ; VI-NEXT: ; implicit-def: $sgpr73 ; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr22 ; VI-NEXT: ; implicit-def: $sgpr63 ; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: ; implicit-def: $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr59 -; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr20 ; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: ; implicit-def: $sgpr56 @@ -66877,10 +67608,38 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_writelane_b32 v63, s52, 12 ; GFX9-NEXT: v_writelane_b32 v63, s53, 13 ; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_mov_b32_e32 v8, s20 +; GFX9-NEXT: v_mov_b32_e32 v9, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s22 +; GFX9-NEXT: v_mov_b32_e32 v11, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v14, s26 +; GFX9-NEXT: v_mov_b32_e32 v15, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_readfirstlane_b32 s18, v4 +; GFX9-NEXT: v_readfirstlane_b32 s19, v5 +; GFX9-NEXT: v_readfirstlane_b32 s16, v6 +; GFX9-NEXT: v_readfirstlane_b32 s17, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s12, v10 +; GFX9-NEXT: v_readfirstlane_b32 s13, v11 +; GFX9-NEXT: v_readfirstlane_b32 s10, v12 +; GFX9-NEXT: v_readfirstlane_b32 s11, v13 +; GFX9-NEXT: v_readfirstlane_b32 s8, v14 +; GFX9-NEXT: v_readfirstlane_b32 s9, v15 +; GFX9-NEXT: v_readfirstlane_b32 s6, v16 +; GFX9-NEXT: v_readfirstlane_b32 s7, v17 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -66904,76 +67663,76 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: s_lshr_b32 s59, s5, 8 ; GFX9-NEXT: s_lshr_b32 s58, s4, 16 ; GFX9-NEXT: s_lshr_b32 s60, s4, 8 -; GFX9-NEXT: s_lshr_b32 s61, s29, 24 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s72, s29, 8 -; GFX9-NEXT: s_lshr_b32 s63, s28, 16 -; GFX9-NEXT: s_lshr_b32 s73, s28, 8 -; GFX9-NEXT: s_lshr_b32 s74, s27, 24 -; GFX9-NEXT: s_lshr_b32 s75, s27, 16 -; GFX9-NEXT: s_lshr_b32 s77, s27, 8 -; GFX9-NEXT: s_lshr_b32 s76, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s26, 8 -; GFX9-NEXT: s_lshr_b32 s79, s25, 24 -; GFX9-NEXT: s_lshr_b32 s88, s25, 16 -; GFX9-NEXT: s_lshr_b32 s90, s25, 8 -; GFX9-NEXT: s_lshr_b32 s89, s24, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 8 -; GFX9-NEXT: s_lshr_b32 s92, s23, 24 -; GFX9-NEXT: s_lshr_b32 s93, s23, 16 -; GFX9-NEXT: s_lshr_b32 s95, s23, 8 -; GFX9-NEXT: s_lshr_b32 s94, s22, 16 -; GFX9-NEXT: s_lshr_b32 s30, s22, 8 -; GFX9-NEXT: s_lshr_b32 s31, s21, 24 -; GFX9-NEXT: s_lshr_b32 s34, s21, 16 -; GFX9-NEXT: s_lshr_b32 s36, s21, 8 -; GFX9-NEXT: s_lshr_b32 s35, s20, 16 -; GFX9-NEXT: s_lshr_b32 s37, s20, 8 -; GFX9-NEXT: s_lshr_b32 s38, s19, 24 -; GFX9-NEXT: s_lshr_b32 s39, s19, 16 -; GFX9-NEXT: s_lshr_b32 s49, s19, 8 -; GFX9-NEXT: s_lshr_b32 s48, s18, 16 -; GFX9-NEXT: s_lshr_b32 s50, s18, 8 -; GFX9-NEXT: s_lshr_b32 s51, s17, 24 -; GFX9-NEXT: s_lshr_b32 s52, s17, 16 -; GFX9-NEXT: s_lshr_b32 s54, s17, 8 -; GFX9-NEXT: s_lshr_b32 s53, s16, 16 -; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b32 s61, s7, 24 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 8 +; GFX9-NEXT: s_lshr_b32 s63, s6, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 8 +; GFX9-NEXT: s_lshr_b32 s74, s9, 24 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s77, s9, 8 +; GFX9-NEXT: s_lshr_b32 s76, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s8, 8 +; GFX9-NEXT: s_lshr_b32 s79, s11, 24 +; GFX9-NEXT: s_lshr_b32 s88, s11, 16 +; GFX9-NEXT: s_lshr_b32 s90, s11, 8 +; GFX9-NEXT: s_lshr_b32 s89, s10, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 8 +; GFX9-NEXT: s_lshr_b32 s92, s13, 24 +; GFX9-NEXT: s_lshr_b32 s93, s13, 16 +; GFX9-NEXT: s_lshr_b32 s95, s13, 8 +; GFX9-NEXT: s_lshr_b32 s94, s12, 16 +; GFX9-NEXT: s_lshr_b32 s30, s12, 8 +; GFX9-NEXT: s_lshr_b32 s31, s15, 24 +; GFX9-NEXT: s_lshr_b32 s34, s15, 16 +; GFX9-NEXT: s_lshr_b32 s36, s15, 8 +; GFX9-NEXT: s_lshr_b32 s35, s14, 16 +; GFX9-NEXT: s_lshr_b32 s37, s14, 8 +; GFX9-NEXT: s_lshr_b32 s38, s17, 24 +; GFX9-NEXT: s_lshr_b32 s39, s17, 16 +; GFX9-NEXT: s_lshr_b32 s49, s17, 8 +; GFX9-NEXT: s_lshr_b32 s48, s16, 16 +; GFX9-NEXT: s_lshr_b32 s50, s16, 8 +; GFX9-NEXT: s_lshr_b32 s51, s19, 24 +; GFX9-NEXT: s_lshr_b32 s52, s19, 16 +; GFX9-NEXT: s_lshr_b32 s54, s19, 8 +; GFX9-NEXT: s_lshr_b32 s53, s18, 16 +; GFX9-NEXT: s_lshr_b32 s55, s18, 8 ; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB97_4 ; GFX9-NEXT: .LBB97_2: ; %cmp.true -; GFX9-NEXT: v_pk_add_u16 v6, s27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s5, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, s25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] ; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] -; GFX9-NEXT: v_pk_add_u16 v10, s23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v9, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8] -; GFX9-NEXT: v_pk_add_u16 v12, s21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, s20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, s29, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] -; GFX9-NEXT: v_pk_add_u16 v16, s19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] -; GFX9-NEXT: v_pk_add_u16 v20, s17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -67023,31 +67782,31 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: .LBB97_3: ; GFX9-NEXT: ; implicit-def: $sgpr55 ; GFX9-NEXT: ; implicit-def: $sgpr53 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr20 ; GFX9-NEXT: ; implicit-def: $sgpr54 ; GFX9-NEXT: ; implicit-def: $sgpr52 ; GFX9-NEXT: ; implicit-def: $sgpr51 ; GFX9-NEXT: ; implicit-def: $sgpr50 ; GFX9-NEXT: ; implicit-def: $sgpr48 -; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr22 ; GFX9-NEXT: ; implicit-def: $sgpr49 ; GFX9-NEXT: ; implicit-def: $sgpr39 ; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr37 ; GFX9-NEXT: ; implicit-def: $sgpr35 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr31 ; GFX9-NEXT: ; implicit-def: $sgpr30 ; GFX9-NEXT: ; implicit-def: $sgpr94 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr95 ; GFX9-NEXT: ; implicit-def: $sgpr93 ; GFX9-NEXT: ; implicit-def: $sgpr92 ; GFX9-NEXT: ; implicit-def: $sgpr91 ; GFX9-NEXT: ; implicit-def: $sgpr89 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr90 ; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr79 @@ -67076,20 +67835,20 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v21, s42 -; GFX9-NEXT: v_mov_b32_e32 v19, s16 -; GFX9-NEXT: v_mov_b32_e32 v20, s17 -; GFX9-NEXT: v_mov_b32_e32 v15, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: v_mov_b32_e32 v11, s20 -; GFX9-NEXT: v_mov_b32_e32 v12, s21 -; GFX9-NEXT: v_mov_b32_e32 v9, s22 -; GFX9-NEXT: v_mov_b32_e32 v10, s23 -; GFX9-NEXT: v_mov_b32_e32 v7, s24 -; GFX9-NEXT: v_mov_b32_e32 v8, s25 -; GFX9-NEXT: v_mov_b32_e32 v5, s26 -; GFX9-NEXT: v_mov_b32_e32 v6, s27 -; GFX9-NEXT: v_mov_b32_e32 v3, s28 -; GFX9-NEXT: v_mov_b32_e32 v4, s29 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s16 +; GFX9-NEXT: v_mov_b32_e32 v16, s17 +; GFX9-NEXT: v_mov_b32_e32 v11, s14 +; GFX9-NEXT: v_mov_b32_e32 v12, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: v_mov_b32_e32 v10, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v17, s55 @@ -67132,15 +67891,15 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v27, s59 ; GFX9-NEXT: v_mov_b32_e32 v14, s57 ; GFX9-NEXT: v_mov_b32_e32 v18, s56 -; GFX9-NEXT: v_mov_b32_e32 v23, s12 -; GFX9-NEXT: v_mov_b32_e32 v24, s10 -; GFX9-NEXT: v_mov_b32_e32 v25, s8 -; GFX9-NEXT: v_mov_b32_e32 v26, s6 +; GFX9-NEXT: v_mov_b32_e32 v23, s26 +; GFX9-NEXT: v_mov_b32_e32 v24, s24 +; GFX9-NEXT: v_mov_b32_e32 v25, s22 +; GFX9-NEXT: v_mov_b32_e32 v26, s20 ; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v21, s40 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 ; GFX9-NEXT: .LBB97_5: ; %end ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -67218,21 +67977,6 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -67249,7 +67993,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -67275,6 +68019,21 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -74852,663 +75611,340 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-LABEL: bitcast_v32bf16_to_v32f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v20, s30, 0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v20, s31, 1 -; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v21, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v19, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v18, s25 +; VI-NEXT: v_mov_b32_e32 v17, s27 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s31, v1 -; VI-NEXT: s_cbranch_scc0 .LBB103_3 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_cbranch_scc0 .LBB103_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB103_4 +; VI-NEXT: s_cbranch_execnz .LBB103_3 ; VI-NEXT: .LBB103_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_lshl_b32 s5, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v3 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s5, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s5, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s5, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s5, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v24, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_and_b32 s5, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s5, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s4, v1 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v20 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v26, v13, v19, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v18 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v19, vcc +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v18, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v13 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_bfe_u32 v13, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v28, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v28, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v28, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v13 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v13, v28, v29, vcc +; VI-NEXT: v_bfe_u32 v28, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v17 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 -; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16 -; VI-NEXT: s_branch .LBB103_5 -; VI-NEXT: .LBB103_3: -; VI-NEXT: s_branch .LBB103_2 -; VI-NEXT: .LBB103_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s30 -; VI-NEXT: v_mov_b32_e32 v15, s31 -; VI-NEXT: .LBB103_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v20, 1 -; VI-NEXT: v_readlane_b32 s30, v20, 0 -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e32 v28, v28, v29, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_bfe_u32 v17, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v12 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v30, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v30, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_bfe_u32 v30, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v17 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v30, v31, vcc +; VI-NEXT: v_bfe_u32 v30, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v16 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v30, v31, vcc +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v31, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc +; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v15 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v32, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v14 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[24:25] +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_mov_b32_e32 v21, v23 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[26:27] +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_mov_b32_e32 v19, v23 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[28:29] +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v33, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v32 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_mov_b32_e32 v17, v23 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[14:15] +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[30:31] +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[12:13] +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11] +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v15, v23 +; VI-NEXT: .LBB103_3: ; %end +; VI-NEXT: v_mov_b32_e32 v1, v22 +; VI-NEXT: v_mov_b32_e32 v3, v21 +; VI-NEXT: v_mov_b32_e32 v5, v20 +; VI-NEXT: v_mov_b32_e32 v7, v19 +; VI-NEXT: v_mov_b32_e32 v9, v18 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_mov_b32_e32 v13, v16 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB103_4: +; VI-NEXT: s_branch .LBB103_2 ; ; GFX9-LABEL: bitcast_v32bf16_to_v32f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 -; GFX9-NEXT: v_readfirstlane_b32 s30, v0 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s31, v1 -; GFX9-NEXT: s_cbranch_scc0 .LBB103_3 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB103_4 -; GFX9-NEXT: .LBB103_2: ; %cmp.true -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX9-NEXT: s_and_b32 s5, s30, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v1, s5, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s5, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s5, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s5, s31, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s5, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s5, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s5, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_lshl_or_b32 v15, v3, 16, v4 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc -; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 -; GFX9-NEXT: s_branch .LBB103_5 -; GFX9-NEXT: .LBB103_3: -; GFX9-NEXT: s_branch .LBB103_2 -; GFX9-NEXT: .LBB103_4: +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 @@ -75522,17 +75958,321 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: v_mov_b32_e32 v14, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s31 -; GFX9-NEXT: .LBB103_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_cbranch_scc0 .LBB103_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB103_3 +; GFX9-NEXT: .LBB103_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v0 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v1 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc +; GFX9-NEXT: v_bfe_u32 v19, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v2 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v20, vcc +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_bfe_u32 v20, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v20, v20, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v20, 0x7fff, v20 +; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc +; GFX9-NEXT: v_bfe_u32 v20, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v20, v20, v3 +; GFX9-NEXT: v_add_u32_e32 v20, 0x7fff, v20 +; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_bfe_u32 v21, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v21, v21, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v21, 0x7fff, v21 +; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc +; GFX9-NEXT: v_bfe_u32 v21, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v21, v21, v4 +; GFX9-NEXT: v_add_u32_e32 v21, 0x7fff, v21 +; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_bfe_u32 v22, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v22, v22, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v22, 0x7fff, v22 +; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc +; GFX9-NEXT: v_bfe_u32 v22, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v22, v22, v5 +; GFX9-NEXT: v_add_u32_e32 v22, 0x7fff, v22 +; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_bfe_u32 v23, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v23, v23, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v23, 0x7fff, v23 +; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc +; GFX9-NEXT: v_bfe_u32 v23, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v23, v23, v6 +; GFX9-NEXT: v_add_u32_e32 v23, 0x7fff, v23 +; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v24, vcc +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_bfe_u32 v24, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v24, v24, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v24, 0x7fff, v24 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc +; GFX9-NEXT: v_bfe_u32 v24, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v24, v24, v7 +; GFX9-NEXT: v_add_u32_e32 v24, 0x7fff, v24 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v25, v25, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v25, 0x7fff, v25 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc +; GFX9-NEXT: v_bfe_u32 v25, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v25, v25, v8 +; GFX9-NEXT: v_add_u32_e32 v25, 0x7fff, v25 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v26, vcc +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v26, v26, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v26, 0x7fff, v26 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc +; GFX9-NEXT: v_bfe_u32 v26, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v26, v26, v9 +; GFX9-NEXT: v_add_u32_e32 v26, 0x7fff, v26 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v27, vcc +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_bfe_u32 v27, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v27, v27, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v27, 0x7fff, v27 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc +; GFX9-NEXT: v_bfe_u32 v27, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v27, v27, v10 +; GFX9-NEXT: v_add_u32_e32 v27, 0x7fff, v27 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_bfe_u32 v28, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v28, v28, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v28, 0x7fff, v28 +; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc +; GFX9-NEXT: v_bfe_u32 v28, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v28, v28, v11 +; GFX9-NEXT: v_add_u32_e32 v28, 0x7fff, v28 +; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_bfe_u32 v29, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v29, v29, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v29, 0x7fff, v29 +; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc +; GFX9-NEXT: v_bfe_u32 v29, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v29, v29, v12 +; GFX9-NEXT: v_add_u32_e32 v29, 0x7fff, v29 +; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v30, vcc +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_bfe_u32 v30, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v30, v30, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v30, 0x7fff, v30 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v30, v31, vcc +; GFX9-NEXT: v_bfe_u32 v30, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v30, v30, v13 +; GFX9-NEXT: v_add_u32_e32 v30, 0x7fff, v30 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_bfe_u32 v31, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v31, v31, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v31, 0x7fff, v31 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc +; GFX9-NEXT: v_bfe_u32 v31, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v31, v31, v14 +; GFX9-NEXT: v_add_u32_e32 v31, 0x7fff, v31 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v32, v32, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_add_u32_e32 v32, 0x7fff, v32 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX9-NEXT: v_bfe_u32 v32, v15, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v32, v32, v15 +; GFX9-NEXT: v_add_u32_e32 v32, 0x7fff, v32 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc +; GFX9-NEXT: v_mov_b32_e32 v32, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_and_b32_sdwa v15, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_and_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_and_b32_sdwa v13, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_and_b32_sdwa v12, v32, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_and_b32_sdwa v11, v32, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_and_b32_sdwa v10, v32, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_and_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_and_b32_sdwa v8, v32, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_and_b32_sdwa v7, v32, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_and_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_and_b32_sdwa v5, v32, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_and_b32_sdwa v4, v32, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_and_b32_sdwa v3, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX9-NEXT: v_and_b32_sdwa v2, v32, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v31, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v14, v30, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v13, v29, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v12, v28, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v11, v27, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v26, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v25, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v8, v24, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v22, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v21, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v19, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v16, 16, v0 +; GFX9-NEXT: .LBB103_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB103_4: +; GFX9-NEXT: s_branch .LBB103_2 ; ; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v32f16_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -76957,24 +77697,8 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -77002,6 +77726,22 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -77314,23 +78054,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 ; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -77354,6 +78078,22 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -77626,25 +78366,9 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -77671,6 +78395,22 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -78863,7 +79603,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v63, s30, 0 ; VI-NEXT: v_writelane_b32 v63, s31, 1 @@ -78884,10 +79624,38 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_writelane_b32 v63, s64, 16 ; VI-NEXT: v_writelane_b32 v63, s65, 17 ; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v17, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_readfirstlane_b32 s18, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s16, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s14, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s12, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s10, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s8, v14 +; VI-NEXT: v_readfirstlane_b32 s9, v15 +; VI-NEXT: v_readfirstlane_b32 s6, v16 +; VI-NEXT: v_readfirstlane_b32 s7, v17 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_and_b64 s[20:21], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -78906,232 +79674,230 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB105_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s75, s5, 24 +; VI-NEXT: s_lshr_b32 s72, s5, 24 ; VI-NEXT: s_lshr_b32 s36, s5, 16 -; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s56, s5, 8 ; VI-NEXT: s_lshr_b32 s37, s4, 16 -; VI-NEXT: s_lshr_b32 s56, s4, 8 -; VI-NEXT: s_lshr_b32 s77, s29, 24 -; VI-NEXT: s_lshr_b32 s38, s29, 16 -; VI-NEXT: s_lshr_b32 s62, s29, 8 -; VI-NEXT: s_lshr_b32 s39, s28, 16 -; VI-NEXT: s_lshr_b32 s57, s28, 8 -; VI-NEXT: s_lshr_b32 s79, s27, 24 -; VI-NEXT: s_lshr_b32 s48, s27, 16 -; VI-NEXT: s_lshr_b32 s74, s27, 8 -; VI-NEXT: s_lshr_b32 s49, s26, 16 -; VI-NEXT: s_lshr_b32 s59, s26, 8 -; VI-NEXT: s_lshr_b32 s89, s25, 24 -; VI-NEXT: s_lshr_b32 s50, s25, 16 -; VI-NEXT: s_lshr_b32 s76, s25, 8 -; VI-NEXT: s_lshr_b32 s51, s24, 16 -; VI-NEXT: s_lshr_b32 s60, s24, 8 -; VI-NEXT: s_lshr_b32 s91, s23, 24 -; VI-NEXT: s_lshr_b32 s52, s23, 16 -; VI-NEXT: s_lshr_b32 s78, s23, 8 -; VI-NEXT: s_lshr_b32 s53, s22, 16 -; VI-NEXT: s_lshr_b32 s61, s22, 8 -; VI-NEXT: s_lshr_b32 s31, s21, 24 -; VI-NEXT: s_lshr_b32 s54, s21, 16 -; VI-NEXT: s_lshr_b32 s88, s21, 8 -; VI-NEXT: s_lshr_b32 s55, s20, 16 -; VI-NEXT: s_lshr_b32 s63, s20, 8 -; VI-NEXT: s_lshr_b32 s34, s19, 24 -; VI-NEXT: s_lshr_b32 s64, s19, 16 -; VI-NEXT: s_lshr_b32 s90, s19, 8 -; VI-NEXT: s_lshr_b32 s65, s18, 16 -; VI-NEXT: s_lshr_b32 s72, s18, 8 -; VI-NEXT: s_lshr_b32 s35, s17, 24 -; VI-NEXT: s_lshr_b32 s66, s17, 16 -; VI-NEXT: s_lshr_b32 s30, s17, 8 -; VI-NEXT: s_lshr_b32 s67, s16, 16 -; VI-NEXT: s_lshr_b32 s73, s16, 8 +; VI-NEXT: s_lshr_b32 s57, s4, 8 +; VI-NEXT: s_lshr_b32 s75, s7, 24 +; VI-NEXT: s_lshr_b32 s38, s7, 16 +; VI-NEXT: s_lshr_b32 s58, s7, 8 +; VI-NEXT: s_lshr_b32 s39, s6, 16 +; VI-NEXT: s_lshr_b32 s59, s6, 8 +; VI-NEXT: s_lshr_b32 s77, s9, 24 +; VI-NEXT: s_lshr_b32 s48, s9, 16 +; VI-NEXT: s_lshr_b32 s60, s9, 8 +; VI-NEXT: s_lshr_b32 s49, s8, 16 +; VI-NEXT: s_lshr_b32 s61, s8, 8 +; VI-NEXT: s_lshr_b32 s79, s11, 24 +; VI-NEXT: s_lshr_b32 s50, s11, 16 +; VI-NEXT: s_lshr_b32 s62, s11, 8 +; VI-NEXT: s_lshr_b32 s51, s10, 16 +; VI-NEXT: s_lshr_b32 s63, s10, 8 +; VI-NEXT: s_lshr_b32 s90, s13, 24 +; VI-NEXT: s_lshr_b32 s52, s13, 16 +; VI-NEXT: s_lshr_b32 s73, s13, 8 +; VI-NEXT: s_lshr_b32 s53, s12, 16 +; VI-NEXT: s_lshr_b32 s74, s12, 8 +; VI-NEXT: s_lshr_b32 s31, s15, 24 +; VI-NEXT: s_lshr_b32 s54, s15, 16 +; VI-NEXT: s_lshr_b32 s76, s15, 8 +; VI-NEXT: s_lshr_b32 s55, s14, 16 +; VI-NEXT: s_lshr_b32 s78, s14, 8 +; VI-NEXT: s_lshr_b32 s34, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s88, s17, 8 +; VI-NEXT: s_lshr_b32 s65, s16, 16 +; VI-NEXT: s_lshr_b32 s89, s16, 8 +; VI-NEXT: s_lshr_b32 s35, s19, 24 +; VI-NEXT: s_lshr_b32 s66, s19, 16 +; VI-NEXT: s_lshr_b32 s91, s19, 8 +; VI-NEXT: s_lshr_b32 s67, s18, 16 +; VI-NEXT: s_lshr_b32 s30, s18, 8 ; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; VI-NEXT: s_cbranch_execnz .LBB105_4 ; VI-NEXT: .LBB105_2: ; %cmp.true -; VI-NEXT: s_lshr_b32 s6, s17, 16 +; VI-NEXT: s_lshr_b32 s20, s19, 16 ; VI-NEXT: v_mov_b32_e32 v1, 0x200 -; VI-NEXT: v_add_f16_e32 v12, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_add_f16_e32 v12, s20, v1 +; VI-NEXT: v_add_f16_e32 v27, s19, v1 +; VI-NEXT: s_lshr_b32 s19, s18, 16 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; VI-NEXT: v_add_f16_e32 v27, s17, v1 -; VI-NEXT: v_add_f16_e32 v19, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s19, 16 +; VI-NEXT: v_add_f16_e32 v19, s19, v1 +; VI-NEXT: v_add_f16_e32 v35, s18, v1 +; VI-NEXT: s_lshr_b32 s18, s17, 16 ; VI-NEXT: v_or_b32_e32 v10, v27, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; VI-NEXT: v_add_f16_e32 v35, s16, v1 -; VI-NEXT: v_add_f16_e32 v13, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s18, 16 +; VI-NEXT: v_add_f16_e32 v13, s18, v1 +; VI-NEXT: v_add_f16_e32 v28, s17, v1 +; VI-NEXT: s_lshr_b32 s17, s16, 16 ; VI-NEXT: v_or_b32_e32 v9, v35, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; VI-NEXT: v_add_f16_e32 v28, s19, v1 -; VI-NEXT: v_add_f16_e32 v20, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s21, 16 -; VI-NEXT: v_or_b32_e32 v62, v28, v2 +; VI-NEXT: v_add_f16_e32 v20, s17, v1 +; VI-NEXT: v_add_f16_e32 v36, s16, v1 +; VI-NEXT: s_lshr_b32 s16, s15, 16 +; VI-NEXT: v_or_b32_e32 v52, v28, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; VI-NEXT: v_add_f16_e32 v36, s18, v1 -; VI-NEXT: v_add_f16_e32 v14, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s20, 16 -; VI-NEXT: v_or_b32_e32 v61, v36, v2 +; VI-NEXT: v_add_f16_e32 v14, s16, v1 +; VI-NEXT: v_add_f16_e32 v29, s15, v1 +; VI-NEXT: s_lshr_b32 s15, s14, 16 +; VI-NEXT: v_or_b32_e32 v51, v36, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; VI-NEXT: v_add_f16_e32 v29, s21, v1 -; VI-NEXT: v_add_f16_e32 v21, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s23, 16 +; VI-NEXT: v_add_f16_e32 v21, s15, v1 +; VI-NEXT: v_add_f16_e32 v37, s14, v1 +; VI-NEXT: s_lshr_b32 s14, s13, 16 ; VI-NEXT: v_or_b32_e32 v8, v29, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; VI-NEXT: v_add_f16_e32 v37, s20, v1 -; VI-NEXT: v_add_f16_e32 v15, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s22, 16 +; VI-NEXT: v_add_f16_e32 v15, s14, v1 +; VI-NEXT: v_add_f16_e32 v30, s13, v1 +; VI-NEXT: s_lshr_b32 s13, s12, 16 ; VI-NEXT: v_or_b32_e32 v7, v37, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; VI-NEXT: v_add_f16_e32 v30, s23, v1 -; VI-NEXT: v_add_f16_e32 v22, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s25, 16 -; VI-NEXT: v_or_b32_e32 v47, v30, v2 +; VI-NEXT: v_add_f16_e32 v22, s13, v1 +; VI-NEXT: v_add_f16_e32 v38, s12, v1 +; VI-NEXT: s_lshr_b32 s12, s11, 16 +; VI-NEXT: v_or_b32_e32 v57, v30, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; VI-NEXT: v_add_f16_e32 v38, s22, v1 -; VI-NEXT: v_add_f16_e32 v16, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s24, 16 -; VI-NEXT: v_or_b32_e32 v46, v38, v2 +; VI-NEXT: v_add_f16_e32 v16, s12, v1 +; VI-NEXT: v_add_f16_e32 v31, s11, v1 +; VI-NEXT: s_lshr_b32 s11, s10, 16 +; VI-NEXT: v_or_b32_e32 v56, v38, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; VI-NEXT: v_add_f16_e32 v31, s25, v1 -; VI-NEXT: v_add_f16_e32 v23, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s27, 16 +; VI-NEXT: v_add_f16_e32 v23, s11, v1 +; VI-NEXT: v_add_f16_e32 v39, s10, v1 +; VI-NEXT: s_lshr_b32 s10, s9, 16 ; VI-NEXT: v_or_b32_e32 v6, v31, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; VI-NEXT: v_add_f16_e32 v39, s24, v1 -; VI-NEXT: v_add_f16_e32 v17, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s26, 16 +; VI-NEXT: v_add_f16_e32 v17, s10, v1 +; VI-NEXT: v_add_f16_e32 v32, s9, v1 +; VI-NEXT: s_lshr_b32 s9, s8, 16 ; VI-NEXT: v_or_b32_e32 v5, v39, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; VI-NEXT: v_add_f16_e32 v32, s27, v1 -; VI-NEXT: v_add_f16_e32 v24, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: v_add_f16_e32 v24, s9, v1 +; VI-NEXT: v_add_f16_e32 v48, s8, v1 +; VI-NEXT: s_lshr_b32 s8, s7, 16 ; VI-NEXT: v_or_b32_e32 v43, v32, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; VI-NEXT: v_add_f16_e32 v48, s26, v1 -; VI-NEXT: v_add_f16_e32 v18, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s28, 16 +; VI-NEXT: v_add_f16_e32 v18, s8, v1 +; VI-NEXT: v_add_f16_e32 v33, s7, v1 +; VI-NEXT: s_lshr_b32 s7, s6, 16 ; VI-NEXT: v_or_b32_e32 v42, v48, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; VI-NEXT: v_add_f16_e32 v33, s29, v1 -; VI-NEXT: v_add_f16_e32 v25, s6, v1 +; VI-NEXT: v_add_f16_e32 v25, s7, v1 +; VI-NEXT: v_add_f16_e32 v49, s6, v1 ; VI-NEXT: s_lshr_b32 s6, s5, 16 ; VI-NEXT: v_or_b32_e32 v55, v33, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; VI-NEXT: v_add_f16_e32 v49, s28, v1 ; VI-NEXT: v_add_f16_e32 v11, s6, v1 ; VI-NEXT: v_add_f16_e32 v34, s5, v1 ; VI-NEXT: s_lshr_b32 s5, s4, 16 ; VI-NEXT: v_or_b32_e32 v54, v49, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 ; VI-NEXT: v_add_f16_e32 v26, s5, v1 -; VI-NEXT: v_or_b32_e32 v52, v34, v2 +; VI-NEXT: v_or_b32_e32 v41, v34, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; VI-NEXT: v_add_f16_e32 v50, s4, v1 -; VI-NEXT: v_or_b32_e32 v51, v50, v2 -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[51:52] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v40, v50, v2 +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[40:41] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[2:3], 24, v[54:55] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[42:43] -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[54:55] ; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 ; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 ; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v42 ; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v6 -; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[46:47] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7 +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[56:57] +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[6:7], 24, v[7:8] -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v52 -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v47 -; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v8 -; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[61:62] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v56 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 +; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[51:52] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v40 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v51 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v9 ; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[9:10] -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v51 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v55 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v55 ; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v54 ; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v43 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v46 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v62 -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v61 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v57 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v52 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 ; VI-NEXT: v_bfe_u32 v9, v11, 8, 8 ; VI-NEXT: v_bfe_u32 v10, v18, 8, 8 ; VI-NEXT: v_bfe_u32 v40, v17, 8, 8 ; VI-NEXT: v_bfe_u32 v43, v16, 8, 8 ; VI-NEXT: v_bfe_u32 v46, v15, 8, 8 ; VI-NEXT: v_bfe_u32 v57, v14, 8, 8 -; VI-NEXT: v_bfe_u32 v59, v13, 8, 8 +; VI-NEXT: v_bfe_u32 v60, v13, 8, 8 ; VI-NEXT: v_bfe_u32 v62, v12, 8, 8 ; VI-NEXT: s_branch .LBB105_5 ; VI-NEXT: .LBB105_3: -; VI-NEXT: ; implicit-def: $sgpr73 -; VI-NEXT: ; implicit-def: $sgpr67 -; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr91 ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr35 -; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr89 ; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr8 -; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr88 ; VI-NEXT: ; implicit-def: $sgpr64 ; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr78 ; VI-NEXT: ; implicit-def: $sgpr55 -; VI-NEXT: ; implicit-def: $sgpr10 -; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr54 ; VI-NEXT: ; implicit-def: $sgpr31 -; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr74 ; VI-NEXT: ; implicit-def: $sgpr53 -; VI-NEXT: ; implicit-def: $sgpr12 -; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr73 ; VI-NEXT: ; implicit-def: $sgpr52 -; VI-NEXT: ; implicit-def: $sgpr91 -; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr63 ; VI-NEXT: ; implicit-def: $sgpr51 -; VI-NEXT: ; implicit-def: $sgpr14 -; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: ; implicit-def: $sgpr50 -; VI-NEXT: ; implicit-def: $sgpr89 -; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: ; implicit-def: $sgpr49 ; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr48 -; VI-NEXT: ; implicit-def: $sgpr79 -; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr59 ; VI-NEXT: ; implicit-def: $sgpr39 ; VI-NEXT: ; implicit-def: $sgpr42 -; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr38 -; VI-NEXT: ; implicit-def: $sgpr77 -; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: ; implicit-def: $sgpr37 ; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr72 ; VI-NEXT: s_branch .LBB105_2 ; VI-NEXT: .LBB105_4: -; VI-NEXT: v_mov_b32_e32 v53, s56 -; VI-NEXT: v_mov_b32_e32 v52, s42 -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v2, s57 +; VI-NEXT: v_mov_b32_e32 v53, s58 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v2, s56 ; VI-NEXT: v_mov_b32_e32 v52, s44 -; VI-NEXT: v_mov_b32_e32 v1, s58 ; VI-NEXT: v_mov_b32_e32 v19, s67 ; VI-NEXT: v_mov_b32_e32 v12, s66 ; VI-NEXT: v_mov_b32_e32 v20, s65 @@ -79148,96 +79914,96 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v18, s38 ; VI-NEXT: v_mov_b32_e32 v26, s37 ; VI-NEXT: v_mov_b32_e32 v11, s36 -; VI-NEXT: v_mov_b32_e32 v35, s16 -; VI-NEXT: v_mov_b32_e32 v27, s17 -; VI-NEXT: v_mov_b32_e32 v36, s18 -; VI-NEXT: v_mov_b32_e32 v28, s19 -; VI-NEXT: v_mov_b32_e32 v37, s20 -; VI-NEXT: v_mov_b32_e32 v29, s21 -; VI-NEXT: v_mov_b32_e32 v38, s22 -; VI-NEXT: v_mov_b32_e32 v30, s23 -; VI-NEXT: v_mov_b32_e32 v39, s24 -; VI-NEXT: v_mov_b32_e32 v31, s25 -; VI-NEXT: v_mov_b32_e32 v48, s26 -; VI-NEXT: v_mov_b32_e32 v32, s27 -; VI-NEXT: v_mov_b32_e32 v49, s28 -; VI-NEXT: v_mov_b32_e32 v33, s29 +; VI-NEXT: v_mov_b32_e32 v35, s18 +; VI-NEXT: v_mov_b32_e32 v27, s19 +; VI-NEXT: v_mov_b32_e32 v36, s16 +; VI-NEXT: v_mov_b32_e32 v28, s17 +; VI-NEXT: v_mov_b32_e32 v37, s14 +; VI-NEXT: v_mov_b32_e32 v29, s15 +; VI-NEXT: v_mov_b32_e32 v38, s12 +; VI-NEXT: v_mov_b32_e32 v30, s13 +; VI-NEXT: v_mov_b32_e32 v39, s10 +; VI-NEXT: v_mov_b32_e32 v31, s11 +; VI-NEXT: v_mov_b32_e32 v48, s8 +; VI-NEXT: v_mov_b32_e32 v32, s9 +; VI-NEXT: v_mov_b32_e32 v49, s6 +; VI-NEXT: v_mov_b32_e32 v33, s7 ; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_mov_b32_e32 v34, s5 ; VI-NEXT: v_mov_b32_e32 v62, s35 -; VI-NEXT: v_mov_b32_e32 v59, s34 +; VI-NEXT: v_mov_b32_e32 v60, s34 ; VI-NEXT: v_mov_b32_e32 v57, s31 -; VI-NEXT: v_mov_b32_e32 v46, s91 -; VI-NEXT: v_mov_b32_e32 v43, s89 -; VI-NEXT: v_mov_b32_e32 v40, s79 -; VI-NEXT: v_mov_b32_e32 v10, s77 -; VI-NEXT: v_mov_b32_e32 v61, s30 -; VI-NEXT: v_mov_b32_e32 v58, s90 -; VI-NEXT: v_mov_b32_e32 v47, s88 -; VI-NEXT: v_mov_b32_e32 v45, s78 -; VI-NEXT: v_mov_b32_e32 v42, s76 -; VI-NEXT: v_mov_b32_e32 v55, s74 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v54, s57 -; VI-NEXT: v_mov_b32_e32 v41, s59 -; VI-NEXT: v_mov_b32_e32 v44, s60 -; VI-NEXT: v_mov_b32_e32 v56, s61 -; VI-NEXT: v_mov_b32_e32 v60, s63 -; VI-NEXT: v_mov_b32_e32 v51, s72 -; VI-NEXT: v_mov_b32_e32 v1, s73 -; VI-NEXT: v_mov_b32_e32 v8, s6 -; VI-NEXT: v_mov_b32_e32 v7, s8 -; VI-NEXT: v_mov_b32_e32 v6, s10 -; VI-NEXT: v_mov_b32_e32 v5, s12 -; VI-NEXT: v_mov_b32_e32 v4, s14 +; VI-NEXT: v_mov_b32_e32 v46, s90 +; VI-NEXT: v_mov_b32_e32 v43, s79 +; VI-NEXT: v_mov_b32_e32 v40, s77 +; VI-NEXT: v_mov_b32_e32 v10, s75 +; VI-NEXT: v_mov_b32_e32 v9, s72 +; VI-NEXT: v_mov_b32_e32 v51, s30 +; VI-NEXT: v_mov_b32_e32 v1, s91 +; VI-NEXT: v_mov_b32_e32 v61, s89 +; VI-NEXT: v_mov_b32_e32 v59, s88 +; VI-NEXT: v_mov_b32_e32 v58, s78 +; VI-NEXT: v_mov_b32_e32 v56, s76 +; VI-NEXT: v_mov_b32_e32 v47, s74 +; VI-NEXT: v_mov_b32_e32 v45, s73 +; VI-NEXT: v_mov_b32_e32 v44, s63 +; VI-NEXT: v_mov_b32_e32 v42, s62 +; VI-NEXT: v_mov_b32_e32 v41, s61 +; VI-NEXT: v_mov_b32_e32 v55, s60 +; VI-NEXT: v_mov_b32_e32 v54, s59 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v7, s22 +; VI-NEXT: v_mov_b32_e32 v6, s24 +; VI-NEXT: v_mov_b32_e32 v5, s26 +; VI-NEXT: v_mov_b32_e32 v4, s28 ; VI-NEXT: v_mov_b32_e32 v3, s40 -; VI-NEXT: v_mov_b32_e32 v9, s75 -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v52, s62 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: .LBB105_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v51 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v35, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; VI-NEXT: v_or_b32_sdwa v8, v35, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v62 ; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v12, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v20, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v7, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v58 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v59 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v59 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v60 ; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v13, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v7, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v58 ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v21, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v6, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v47 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v56 ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v57 ; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v6, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v56 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v47 ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -79279,25 +80045,23 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v3, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s67, v63, 19 ; VI-NEXT: v_readlane_b32 s66, v63, 18 ; VI-NEXT: v_readlane_b32 s65, v63, 17 @@ -79318,30 +80082,16 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v9 ; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 @@ -79350,8 +80100,23 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -79377,10 +80142,38 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_writelane_b32 v63, s52, 12 ; GFX9-NEXT: v_writelane_b32 v63, s53, 13 ; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_mov_b32_e32 v8, s20 +; GFX9-NEXT: v_mov_b32_e32 v9, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s22 +; GFX9-NEXT: v_mov_b32_e32 v11, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v14, s26 +; GFX9-NEXT: v_mov_b32_e32 v15, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_readfirstlane_b32 s18, v4 +; GFX9-NEXT: v_readfirstlane_b32 s19, v5 +; GFX9-NEXT: v_readfirstlane_b32 s16, v6 +; GFX9-NEXT: v_readfirstlane_b32 s17, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s12, v10 +; GFX9-NEXT: v_readfirstlane_b32 s13, v11 +; GFX9-NEXT: v_readfirstlane_b32 s10, v12 +; GFX9-NEXT: v_readfirstlane_b32 s11, v13 +; GFX9-NEXT: v_readfirstlane_b32 s8, v14 +; GFX9-NEXT: v_readfirstlane_b32 s9, v15 +; GFX9-NEXT: v_readfirstlane_b32 s6, v16 +; GFX9-NEXT: v_readfirstlane_b32 s7, v17 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -79404,66 +80197,66 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: s_lshr_b32 s59, s5, 8 ; GFX9-NEXT: s_lshr_b32 s58, s4, 16 ; GFX9-NEXT: s_lshr_b32 s60, s4, 8 -; GFX9-NEXT: s_lshr_b32 s61, s29, 24 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s72, s29, 8 -; GFX9-NEXT: s_lshr_b32 s63, s28, 16 -; GFX9-NEXT: s_lshr_b32 s73, s28, 8 -; GFX9-NEXT: s_lshr_b32 s74, s27, 24 -; GFX9-NEXT: s_lshr_b32 s75, s27, 16 -; GFX9-NEXT: s_lshr_b32 s77, s27, 8 -; GFX9-NEXT: s_lshr_b32 s76, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s26, 8 -; GFX9-NEXT: s_lshr_b32 s79, s25, 24 -; GFX9-NEXT: s_lshr_b32 s88, s25, 16 -; GFX9-NEXT: s_lshr_b32 s90, s25, 8 -; GFX9-NEXT: s_lshr_b32 s89, s24, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 8 -; GFX9-NEXT: s_lshr_b32 s92, s23, 24 -; GFX9-NEXT: s_lshr_b32 s93, s23, 16 -; GFX9-NEXT: s_lshr_b32 s95, s23, 8 -; GFX9-NEXT: s_lshr_b32 s94, s22, 16 -; GFX9-NEXT: s_lshr_b32 s30, s22, 8 -; GFX9-NEXT: s_lshr_b32 s31, s21, 24 -; GFX9-NEXT: s_lshr_b32 s34, s21, 16 -; GFX9-NEXT: s_lshr_b32 s36, s21, 8 -; GFX9-NEXT: s_lshr_b32 s35, s20, 16 -; GFX9-NEXT: s_lshr_b32 s37, s20, 8 -; GFX9-NEXT: s_lshr_b32 s38, s19, 24 -; GFX9-NEXT: s_lshr_b32 s39, s19, 16 -; GFX9-NEXT: s_lshr_b32 s49, s19, 8 -; GFX9-NEXT: s_lshr_b32 s48, s18, 16 -; GFX9-NEXT: s_lshr_b32 s50, s18, 8 -; GFX9-NEXT: s_lshr_b32 s51, s17, 24 -; GFX9-NEXT: s_lshr_b32 s52, s17, 16 -; GFX9-NEXT: s_lshr_b32 s54, s17, 8 -; GFX9-NEXT: s_lshr_b32 s53, s16, 16 -; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b32 s61, s7, 24 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 8 +; GFX9-NEXT: s_lshr_b32 s63, s6, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 8 +; GFX9-NEXT: s_lshr_b32 s74, s9, 24 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s77, s9, 8 +; GFX9-NEXT: s_lshr_b32 s76, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s8, 8 +; GFX9-NEXT: s_lshr_b32 s79, s11, 24 +; GFX9-NEXT: s_lshr_b32 s88, s11, 16 +; GFX9-NEXT: s_lshr_b32 s90, s11, 8 +; GFX9-NEXT: s_lshr_b32 s89, s10, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 8 +; GFX9-NEXT: s_lshr_b32 s92, s13, 24 +; GFX9-NEXT: s_lshr_b32 s93, s13, 16 +; GFX9-NEXT: s_lshr_b32 s95, s13, 8 +; GFX9-NEXT: s_lshr_b32 s94, s12, 16 +; GFX9-NEXT: s_lshr_b32 s30, s12, 8 +; GFX9-NEXT: s_lshr_b32 s31, s15, 24 +; GFX9-NEXT: s_lshr_b32 s34, s15, 16 +; GFX9-NEXT: s_lshr_b32 s36, s15, 8 +; GFX9-NEXT: s_lshr_b32 s35, s14, 16 +; GFX9-NEXT: s_lshr_b32 s37, s14, 8 +; GFX9-NEXT: s_lshr_b32 s38, s17, 24 +; GFX9-NEXT: s_lshr_b32 s39, s17, 16 +; GFX9-NEXT: s_lshr_b32 s49, s17, 8 +; GFX9-NEXT: s_lshr_b32 s48, s16, 16 +; GFX9-NEXT: s_lshr_b32 s50, s16, 8 +; GFX9-NEXT: s_lshr_b32 s51, s19, 24 +; GFX9-NEXT: s_lshr_b32 s52, s19, 16 +; GFX9-NEXT: s_lshr_b32 s54, s19, 8 +; GFX9-NEXT: s_lshr_b32 s53, s18, 16 +; GFX9-NEXT: s_lshr_b32 s55, s18, 8 ; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB105_4 ; GFX9-NEXT: .LBB105_2: ; %cmp.true ; GFX9-NEXT: v_mov_b32_e32 v1, 0x200 -; GFX9-NEXT: v_pk_add_f16 v20, s17, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, s16, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, s19, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v15, s18, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, s21, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, s20, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, s23, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v9, s22, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, s25, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, s24, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, s27, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, s26, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, s29, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, s28, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, s19, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, s18, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, s17, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, s16, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s15, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s14, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s13, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s12, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s11, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s10, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s9, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s8, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s7, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s6, v1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v2, s5, v1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, s4, v1 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] @@ -79524,31 +80317,31 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: .LBB105_3: ; GFX9-NEXT: ; implicit-def: $sgpr55 ; GFX9-NEXT: ; implicit-def: $sgpr53 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr20 ; GFX9-NEXT: ; implicit-def: $sgpr54 ; GFX9-NEXT: ; implicit-def: $sgpr52 ; GFX9-NEXT: ; implicit-def: $sgpr51 ; GFX9-NEXT: ; implicit-def: $sgpr50 ; GFX9-NEXT: ; implicit-def: $sgpr48 -; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr22 ; GFX9-NEXT: ; implicit-def: $sgpr49 ; GFX9-NEXT: ; implicit-def: $sgpr39 ; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr37 ; GFX9-NEXT: ; implicit-def: $sgpr35 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr31 ; GFX9-NEXT: ; implicit-def: $sgpr30 ; GFX9-NEXT: ; implicit-def: $sgpr94 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr95 ; GFX9-NEXT: ; implicit-def: $sgpr93 ; GFX9-NEXT: ; implicit-def: $sgpr92 ; GFX9-NEXT: ; implicit-def: $sgpr91 ; GFX9-NEXT: ; implicit-def: $sgpr89 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr90 ; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr79 @@ -79577,20 +80370,20 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v21, s42 -; GFX9-NEXT: v_mov_b32_e32 v19, s16 -; GFX9-NEXT: v_mov_b32_e32 v20, s17 -; GFX9-NEXT: v_mov_b32_e32 v15, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: v_mov_b32_e32 v11, s20 -; GFX9-NEXT: v_mov_b32_e32 v12, s21 -; GFX9-NEXT: v_mov_b32_e32 v9, s22 -; GFX9-NEXT: v_mov_b32_e32 v10, s23 -; GFX9-NEXT: v_mov_b32_e32 v7, s24 -; GFX9-NEXT: v_mov_b32_e32 v8, s25 -; GFX9-NEXT: v_mov_b32_e32 v5, s26 -; GFX9-NEXT: v_mov_b32_e32 v6, s27 -; GFX9-NEXT: v_mov_b32_e32 v3, s28 -; GFX9-NEXT: v_mov_b32_e32 v4, s29 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s16 +; GFX9-NEXT: v_mov_b32_e32 v16, s17 +; GFX9-NEXT: v_mov_b32_e32 v11, s14 +; GFX9-NEXT: v_mov_b32_e32 v12, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: v_mov_b32_e32 v10, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v17, s55 @@ -79633,15 +80426,15 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v27, s59 ; GFX9-NEXT: v_mov_b32_e32 v14, s57 ; GFX9-NEXT: v_mov_b32_e32 v18, s56 -; GFX9-NEXT: v_mov_b32_e32 v23, s12 -; GFX9-NEXT: v_mov_b32_e32 v24, s10 -; GFX9-NEXT: v_mov_b32_e32 v25, s8 -; GFX9-NEXT: v_mov_b32_e32 v26, s6 +; GFX9-NEXT: v_mov_b32_e32 v23, s26 +; GFX9-NEXT: v_mov_b32_e32 v24, s24 +; GFX9-NEXT: v_mov_b32_e32 v25, s22 +; GFX9-NEXT: v_mov_b32_e32 v26, s20 ; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v21, s40 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 ; GFX9-NEXT: .LBB105_5: ; %end ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -79719,21 +80512,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -79750,7 +80528,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -79776,6 +80554,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -85177,24 +85970,8 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -85222,6 +85999,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -85777,22 +86570,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -85826,6 +86603,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -86357,10 +87150,21 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -86377,17 +87181,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -87519,1348 +88312,1564 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-LABEL: bitcast_v32bf16_to_v64i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v40, s30, 0 +; SI-NEXT: v_writelane_b32 v40, s31, 1 +; SI-NEXT: v_writelane_b32 v40, s34, 2 +; SI-NEXT: v_writelane_b32 v40, s35, 3 +; SI-NEXT: v_writelane_b32 v40, s36, 4 +; SI-NEXT: v_writelane_b32 v40, s37, 5 +; SI-NEXT: v_writelane_b32 v40, s38, 6 +; SI-NEXT: v_writelane_b32 v40, s39, 7 +; SI-NEXT: v_writelane_b32 v40, s48, 8 +; SI-NEXT: v_writelane_b32 v40, s49, 9 +; SI-NEXT: v_writelane_b32 v40, s50, 10 +; SI-NEXT: v_writelane_b32 v40, s51, 11 +; SI-NEXT: v_writelane_b32 v40, s52, 12 +; SI-NEXT: v_writelane_b32 v40, s53, 13 +; SI-NEXT: v_writelane_b32 v40, s54, 14 +; SI-NEXT: v_writelane_b32 v40, s55, 15 +; SI-NEXT: v_writelane_b32 v40, s64, 16 +; SI-NEXT: v_writelane_b32 v40, s65, 17 +; SI-NEXT: v_writelane_b32 v40, s66, 18 +; SI-NEXT: v_writelane_b32 v40, s67, 19 +; SI-NEXT: v_writelane_b32 v40, s68, 20 +; SI-NEXT: v_writelane_b32 v40, s69, 21 +; SI-NEXT: v_writelane_b32 v40, s70, 22 +; SI-NEXT: v_writelane_b32 v40, s71, 23 +; SI-NEXT: v_writelane_b32 v40, s80, 24 +; SI-NEXT: v_writelane_b32 v40, s81, 25 +; SI-NEXT: v_writelane_b32 v40, s82, 26 +; SI-NEXT: v_writelane_b32 v40, s83, 27 +; SI-NEXT: v_writelane_b32 v40, s84, 28 +; SI-NEXT: v_writelane_b32 v40, s85, 29 +; SI-NEXT: v_writelane_b32 v40, s86, 30 +; SI-NEXT: v_writelane_b32 v40, s87, 31 +; SI-NEXT: v_writelane_b32 v40, s96, 32 +; SI-NEXT: v_writelane_b32 v40, s97, 33 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v40, s98, 34 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mul_f32_e64 v19, 1.0, s17 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v9 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s28 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s28 +; SI-NEXT: v_writelane_b32 v40, s99, 35 +; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_alignbit_b32 v23, v1, v3, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_alignbit_b32 v20, v1, v6, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_alignbit_b32 v17, v1, v38, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_alignbit_b32 v14, v1, v55, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50 -; SI-NEXT: v_alignbit_b32 v11, v1, v52, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v1, v46, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 -; SI-NEXT: v_alignbit_b32 v21, v19, v4, 16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v35 -; SI-NEXT: v_alignbit_b32 v4, v1, v25, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; SI-NEXT: v_alignbit_b32 v18, v16, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v57 -; SI-NEXT: v_alignbit_b32 v3, v1, v37, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; SI-NEXT: v_alignbit_b32 v24, v22, v2, 16 -; SI-NEXT: v_alignbit_b32 v15, v13, v27, 16 -; SI-NEXT: v_alignbit_b32 v12, v10, v49, 16 -; SI-NEXT: v_alignbit_b32 v9, v7, v43, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v60, 16 -; SI-NEXT: v_alignbit_b32 v2, v1, v34, 16 -; SI-NEXT: v_readfirstlane_b32 s8, v23 -; SI-NEXT: v_readfirstlane_b32 s9, v24 -; SI-NEXT: v_readfirstlane_b32 s14, v20 -; SI-NEXT: v_readfirstlane_b32 s15, v21 -; SI-NEXT: v_readfirstlane_b32 s20, v17 -; SI-NEXT: v_readfirstlane_b32 s21, v18 -; SI-NEXT: v_readfirstlane_b32 s26, v14 -; SI-NEXT: v_readfirstlane_b32 s27, v15 -; SI-NEXT: v_readfirstlane_b32 s42, v11 -; SI-NEXT: v_readfirstlane_b32 s43, v12 -; SI-NEXT: v_readfirstlane_b32 s56, v8 -; SI-NEXT: v_readfirstlane_b32 s57, v9 -; SI-NEXT: v_readfirstlane_b32 s62, v4 -; SI-NEXT: v_readfirstlane_b32 s63, v5 -; SI-NEXT: v_readfirstlane_b32 s76, v3 -; SI-NEXT: v_readfirstlane_b32 s77, v2 -; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[8:9], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[20:21], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 -; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[62:63], 24 -; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[76:77], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 8 -; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v24 -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v35 -; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v18 -; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v33 -; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v15 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v39 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v40 -; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v57 -; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v5 -; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v32 -; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v19 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_lshr_b32 s73, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s72, v2 +; SI-NEXT: s_lshr_b64 s[76:77], s[72:73], 16 +; SI-NEXT: s_mov_b32 s75, s76 +; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s4, 0 +; SI-NEXT: v_writelane_b32 v41, s5, 1 +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: s_lshr_b32 s59, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: s_lshr_b32 s45, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v20 +; SI-NEXT: s_lshr_b32 s25, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v24 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v25 +; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v22 +; SI-NEXT: s_lshr_b32 s41, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v28 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v29 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v26 +; SI-NEXT: s_lshr_b32 s19, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v30 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v18 +; SI-NEXT: v_readfirstlane_b32 s58, v5 +; SI-NEXT: v_readfirstlane_b32 s44, v9 +; SI-NEXT: v_readfirstlane_b32 s24, v21 +; SI-NEXT: v_readfirstlane_b32 s40, v23 +; SI-NEXT: v_readfirstlane_b32 s18, v27 +; SI-NEXT: v_readfirstlane_b32 s10, v31 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_lshr_b64 s[62:63], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_mov_b32 s61, s62 +; SI-NEXT: s_mov_b32 s47, s56 +; SI-NEXT: s_mov_b32 s27, s42 +; SI-NEXT: s_mov_b32 s17, s22 +; SI-NEXT: s_mov_b32 s21, s28 +; SI-NEXT: s_mov_b32 s13, s14 +; SI-NEXT: s_mov_b32 s7, s8 +; SI-NEXT: s_lshr_b64 s[88:89], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[74:75], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[60:61], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[60:61], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[46:47], 8 +; SI-NEXT: s_lshr_b64 s[48:49], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[50:51], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[54:55], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[64:65], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[68:69], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v4 +; SI-NEXT: s_lshr_b32 s24, s76, 8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; SI-NEXT: s_lshr_b32 s23, s62, 8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v20 +; SI-NEXT: s_lshr_b32 s18, s56, 8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v22 +; SI-NEXT: s_lshr_b32 s17, s42, 8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v26 +; SI-NEXT: s_lshr_b32 s15, s22, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v30 +; SI-NEXT: s_lshr_b32 s10, s28, 8 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v18 +; SI-NEXT: s_lshr_b32 s9, s14, 8 +; SI-NEXT: s_lshr_b32 s4, s8, 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[86:87], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[96:97], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[98:99], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[80:81], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 8 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v38 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v52 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: v_readfirstlane_b32 s10, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v31 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v15 +; SI-NEXT: s_lshr_b64 s[12:13], s[10:11], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v30 +; SI-NEXT: v_readfirstlane_b32 s10, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v27 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v26 +; SI-NEXT: v_readfirstlane_b32 s18, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 +; SI-NEXT: s_lshr_b32 s17, s4, 16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 +; SI-NEXT: s_lshr_b64 s[20:21], s[16:17], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_readfirstlane_b32 s16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 +; SI-NEXT: s_lshr_b32 s19, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: s_lshr_b32 s17, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: s_lshr_b32 s41, s4, 16 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v21 +; SI-NEXT: v_readfirstlane_b32 s24, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: s_lshr_b32 s25, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 +; SI-NEXT: s_lshr_b32 s25, s4, 16 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: s_lshr_b32 s45, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s44, v11 +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 16 +; SI-NEXT: s_lshr_b32 s45, s4, 16 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: s_lshr_b32 s59, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s58, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v8, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 -; SI-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: s_lshr_b64 s[60:61], s[58:59], 16 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_readfirstlane_b32 s58, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v19 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 -; SI-NEXT: v_alignbit_b32 v2, v1, v2, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v7, v9, 16 -; SI-NEXT: v_alignbit_b32 v12, v10, v12, 16 -; SI-NEXT: v_readfirstlane_b32 s76, v3 -; SI-NEXT: v_readfirstlane_b32 s77, v2 -; SI-NEXT: v_readfirstlane_b32 s62, v4 -; SI-NEXT: v_readfirstlane_b32 s63, v5 -; SI-NEXT: v_readfirstlane_b32 s56, v8 -; SI-NEXT: v_readfirstlane_b32 s57, v9 -; SI-NEXT: v_readfirstlane_b32 s42, v11 -; SI-NEXT: v_readfirstlane_b32 s43, v12 -; SI-NEXT: v_readfirstlane_b32 s26, v14 -; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8 -; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v36 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v34 -; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v32 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_alignbit_b32 v17, v17, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s20, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_alignbit_b32 v20, v20, v19, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v23, v23, v22, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v13 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v31 -; SI-NEXT: v_alignbit_b32 v15, v13, v15, 16 -; SI-NEXT: v_alignbit_b32 v18, v16, v18, 16 -; SI-NEXT: v_readfirstlane_b32 s27, v15 -; SI-NEXT: v_readfirstlane_b32 s21, v18 -; SI-NEXT: v_readfirstlane_b32 s14, v20 -; SI-NEXT: v_readfirstlane_b32 s8, v23 -; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v18 -; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v15 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 -; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v33 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v19 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; SI-NEXT: v_alignbit_b32 v21, v19, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v22, v24, 16 -; SI-NEXT: v_readfirstlane_b32 s15, v21 -; SI-NEXT: v_readfirstlane_b32 s9, v24 -; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[8:9], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[20:21], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 -; SI-NEXT: s_lshr_b64 s[56:57], s[62:63], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[76:77], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 8 -; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v24 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v26 -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_lshr_b32 s59, s4, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_lshr_b32 s73, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s72, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: s_lshr_b64 s[74:75], s[72:73], 16 +; SI-NEXT: s_lshr_b32 s73, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s72, v2 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: s_lshr_b64 s[76:77], s[72:73], 16 +; SI-NEXT: v_readfirstlane_b32 s40, v18 +; SI-NEXT: v_readfirstlane_b32 s24, v13 +; SI-NEXT: v_readfirstlane_b32 s44, v9 +; SI-NEXT: s_mov_b32 s75, s76 +; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[74:75], 24 +; SI-NEXT: s_mov_b32 s7, s8 +; SI-NEXT: s_mov_b32 s13, s14 +; SI-NEXT: s_mov_b32 s21, s28 +; SI-NEXT: s_mov_b32 s17, s22 +; SI-NEXT: s_mov_b32 s27, s42 +; SI-NEXT: s_mov_b32 s47, s56 +; SI-NEXT: s_mov_b32 s61, s62 +; SI-NEXT: v_writelane_b32 v41, s78, 0 +; SI-NEXT: v_writelane_b32 v41, s79, 1 +; SI-NEXT: s_lshr_b64 s[88:89], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[74:75], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[60:61], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[60:61], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[46:47], 8 +; SI-NEXT: s_lshr_b64 s[48:49], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[50:51], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[54:55], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[64:65], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[68:69], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 16 +; SI-NEXT: s_lshr_b32 s24, s76, 8 +; SI-NEXT: s_lshr_b32 s23, s62, 8 +; SI-NEXT: s_lshr_b32 s18, s56, 8 +; SI-NEXT: s_lshr_b32 s17, s42, 8 +; SI-NEXT: s_lshr_b32 s15, s22, 8 +; SI-NEXT: s_lshr_b32 s10, s28, 8 +; SI-NEXT: s_lshr_b32 s9, s14, 8 +; SI-NEXT: s_lshr_b32 s4, s8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v14 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[86:87], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[96:97], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[98:99], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[80:81], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 8 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: v_or_b32_e32 v23, s5, v23 -; SI-NEXT: s_and_b32 s5, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s4, s4, 24 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_or_b32_e32 v23, s4, v23 -; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v27 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: s_lshl_b32 s4, s16, 8 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v30 -; SI-NEXT: v_or_b32_e32 v20, s4, v20 -; SI-NEXT: s_and_b32 s4, s12, 0xff -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s8, 24 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v20, s4, v20 +; SI-NEXT: s_and_b32 s7, s74, 0xff +; SI-NEXT: s_lshl_b32 s13, s92, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s88, 0xff +; SI-NEXT: v_readlane_b32 s74, v41, 0 +; SI-NEXT: s_lshl_b32 s21, s74, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_or_b32 s13, s21, s13 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_and_b32 s7, s76, 0xff +; SI-NEXT: s_lshl_b32 s13, s24, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s73, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v48 +; SI-NEXT: v_or_b32_e32 v2, s13, v2 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v2, s7, v2 +; SI-NEXT: s_and_b32 s7, s60, 0xff +; SI-NEXT: s_lshl_b32 s13, s30, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s94, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s21, s90, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s21, s13 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_lshl_b32 s4, s22, 8 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v62 -; SI-NEXT: v_or_b32_e32 v17, s4, v17 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s14, 24 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v17, s4, v17 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s62, 0xff +; SI-NEXT: s_lshl_b32 s13, s23, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s59, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v39 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s13, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s46, 0xff +; SI-NEXT: s_lshl_b32 s13, s38, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s36, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s21, s34, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s21, s13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s56, 0xff +; SI-NEXT: s_lshl_b32 s13, s18, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v28 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: s_lshl_b32 s4, s28, 8 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v31 -; SI-NEXT: v_or_b32_e32 v14, s4, v14 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s20, 24 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 20, v0 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s45, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v38 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s13, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s13, s52, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s50, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s18, s48, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s18, s13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s42, 0xff +; SI-NEXT: s_lshl_b32 s13, s17, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v59 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: s_lshl_b32 s4, s44, 8 -; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v63 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s26, 24 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v37 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s13, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s16, 0xff +; SI-NEXT: s_lshl_b32 s13, s66, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s64, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s16, s54, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s16, s13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s13, s15, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v47 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: s_lshl_b32 s4, s58, 8 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v56 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s46, 0xff -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s42, 24 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s41, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v34 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s13, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s20, 0xff +; SI-NEXT: s_lshl_b32 s13, s78, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s70, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s15, s68, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s15, s13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v41 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: s_lshl_b32 s4, s72, 8 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v45 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s56, 24 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s10, s19, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s12, 0xff +; SI-NEXT: s_lshl_b32 s10, s98, 8 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s10, s96, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s12, s86, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s10, s12, s10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s14, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v54 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v42 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: s_lshl_b32 s4, s76, 8 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s74, 0xff -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s62, 24 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s11, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s9, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s7, s84, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s82, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s80, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v48 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s8, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 8 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s4, s6, s4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: v_readlane_b32 s75, v41, 1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s99, v40, 35 +; SI-NEXT: v_readlane_b32 s98, v40, 34 +; SI-NEXT: v_readlane_b32 s97, v40, 33 +; SI-NEXT: v_readlane_b32 s96, v40, 32 +; SI-NEXT: v_readlane_b32 s87, v40, 31 +; SI-NEXT: v_readlane_b32 s86, v40, 30 +; SI-NEXT: v_readlane_b32 s85, v40, 29 +; SI-NEXT: v_readlane_b32 s84, v40, 28 +; SI-NEXT: v_readlane_b32 s83, v40, 27 +; SI-NEXT: v_readlane_b32 s82, v40, 26 +; SI-NEXT: v_readlane_b32 s81, v40, 25 +; SI-NEXT: v_readlane_b32 s80, v40, 24 +; SI-NEXT: v_readlane_b32 s71, v40, 23 +; SI-NEXT: v_readlane_b32 s70, v40, 22 +; SI-NEXT: v_readlane_b32 s69, v40, 21 +; SI-NEXT: v_readlane_b32 s68, v40, 20 +; SI-NEXT: v_readlane_b32 s67, v40, 19 +; SI-NEXT: v_readlane_b32 s66, v40, 18 +; SI-NEXT: v_readlane_b32 s65, v40, 17 +; SI-NEXT: v_readlane_b32 s64, v40, 16 +; SI-NEXT: v_readlane_b32 s55, v40, 15 +; SI-NEXT: v_readlane_b32 s54, v40, 14 +; SI-NEXT: v_readlane_b32 s53, v40, 13 +; SI-NEXT: v_readlane_b32 s52, v40, 12 +; SI-NEXT: v_readlane_b32 s51, v40, 11 +; SI-NEXT: v_readlane_b32 s50, v40, 10 +; SI-NEXT: v_readlane_b32 s49, v40, 9 +; SI-NEXT: v_readlane_b32 s48, v40, 8 +; SI-NEXT: v_readlane_b32 s39, v40, 7 +; SI-NEXT: v_readlane_b32 s38, v40, 6 +; SI-NEXT: v_readlane_b32 s37, v40, 5 +; SI-NEXT: v_readlane_b32 s36, v40, 4 +; SI-NEXT: v_readlane_b32 s35, v40, 3 +; SI-NEXT: v_readlane_b32 s34, v40, 2 +; SI-NEXT: v_readlane_b32 s31, v40, 1 +; SI-NEXT: v_readlane_b32 s30, v40, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s4, 0 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: v_writelane_b32 v41, s5, 1 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr5 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v32bf16_to_v64i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v63, s30, 0 -; VI-NEXT: v_writelane_b32 v63, s31, 1 -; VI-NEXT: v_writelane_b32 v63, s34, 2 -; VI-NEXT: v_writelane_b32 v63, s35, 3 -; VI-NEXT: v_writelane_b32 v63, s36, 4 -; VI-NEXT: v_writelane_b32 v63, s37, 5 -; VI-NEXT: v_writelane_b32 v63, s38, 6 -; VI-NEXT: v_writelane_b32 v63, s39, 7 -; VI-NEXT: v_writelane_b32 v63, s48, 8 -; VI-NEXT: v_writelane_b32 v63, s49, 9 -; VI-NEXT: v_writelane_b32 v63, s50, 10 -; VI-NEXT: v_writelane_b32 v63, s51, 11 -; VI-NEXT: v_writelane_b32 v63, s52, 12 -; VI-NEXT: v_writelane_b32 v63, s53, 13 -; VI-NEXT: v_writelane_b32 v63, s54, 14 -; VI-NEXT: v_writelane_b32 v63, s55, 15 -; VI-NEXT: v_writelane_b32 v63, s64, 16 -; VI-NEXT: v_writelane_b32 v63, s65, 17 -; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_writelane_b32 v18, s30, 0 +; VI-NEXT: v_writelane_b32 v18, s31, 1 +; VI-NEXT: v_writelane_b32 v18, s34, 2 +; VI-NEXT: v_writelane_b32 v18, s35, 3 +; VI-NEXT: v_writelane_b32 v18, s36, 4 +; VI-NEXT: v_writelane_b32 v18, s37, 5 +; VI-NEXT: v_writelane_b32 v18, s38, 6 +; VI-NEXT: v_writelane_b32 v18, s39, 7 +; VI-NEXT: v_writelane_b32 v18, s48, 8 +; VI-NEXT: v_writelane_b32 v18, s49, 9 +; VI-NEXT: v_writelane_b32 v18, s50, 10 +; VI-NEXT: v_writelane_b32 v18, s51, 11 +; VI-NEXT: v_writelane_b32 v18, s52, 12 +; VI-NEXT: v_writelane_b32 v18, s53, 13 +; VI-NEXT: v_writelane_b32 v18, s54, 14 +; VI-NEXT: v_writelane_b32 v18, s55, 15 +; VI-NEXT: v_writelane_b32 v18, s64, 16 +; VI-NEXT: v_writelane_b32 v18, s65, 17 +; VI-NEXT: v_writelane_b32 v18, s66, 18 +; VI-NEXT: v_writelane_b32 v18, s67, 19 +; VI-NEXT: v_writelane_b32 v18, s68, 20 +; VI-NEXT: v_writelane_b32 v18, s69, 21 +; VI-NEXT: v_writelane_b32 v18, s70, 22 +; VI-NEXT: v_writelane_b32 v18, s71, 23 +; VI-NEXT: v_writelane_b32 v18, s80, 24 +; VI-NEXT: v_writelane_b32 v18, s81, 25 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v17, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_writelane_b32 v18, s82, 26 +; VI-NEXT: v_readfirstlane_b32 s18, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s16, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s14, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s12, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s10, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s8, v14 +; VI-NEXT: v_readfirstlane_b32 s9, v15 +; VI-NEXT: v_readfirstlane_b32 s6, v16 +; VI-NEXT: v_readfirstlane_b32 s7, v17 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_and_b64 s[20:21], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB109_3 +; VI-NEXT: v_writelane_b32 v18, s83, 27 +; VI-NEXT: s_cbranch_scc0 .LBB109_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s56, s5, 24 -; VI-NEXT: s_lshr_b32 s57, s5, 16 -; VI-NEXT: s_lshr_b32 s59, s5, 8 -; VI-NEXT: s_lshr_b32 s58, s4, 16 -; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s72, s29, 8 -; VI-NEXT: s_lshr_b32 s63, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s77, s27, 8 -; VI-NEXT: s_lshr_b32 s76, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s90, s25, 8 -; VI-NEXT: s_lshr_b32 s89, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s35, s23, 8 -; VI-NEXT: s_lshr_b32 s34, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s48, s21, 8 -; VI-NEXT: s_lshr_b32 s39, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s53, s19, 8 -; VI-NEXT: s_lshr_b32 s52, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s66, s17, 8 -; VI-NEXT: s_lshr_b32 s65, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 -; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 -; VI-NEXT: s_cbranch_execnz .LBB109_4 +; VI-NEXT: s_lshr_b32 s21, s5, 24 +; VI-NEXT: s_lshr_b32 s23, s5, 16 +; VI-NEXT: s_lshr_b32 s25, s5, 8 +; VI-NEXT: s_lshr_b32 s27, s4, 16 +; VI-NEXT: s_lshr_b32 s29, s4, 8 +; VI-NEXT: s_lshr_b32 s41, s7, 24 +; VI-NEXT: s_lshr_b32 s47, s7, 16 +; VI-NEXT: s_lshr_b32 s57, s7, 8 +; VI-NEXT: s_lshr_b32 s88, s6, 16 +; VI-NEXT: s_lshr_b32 s89, s6, 8 +; VI-NEXT: s_lshr_b32 s90, s9, 24 +; VI-NEXT: s_lshr_b32 s91, s9, 16 +; VI-NEXT: s_lshr_b32 s30, s9, 8 +; VI-NEXT: s_lshr_b32 s31, s8, 16 +; VI-NEXT: s_lshr_b32 s34, s8, 8 +; VI-NEXT: s_lshr_b32 s35, s11, 24 +; VI-NEXT: s_lshr_b32 s36, s11, 16 +; VI-NEXT: s_lshr_b32 s37, s11, 8 +; VI-NEXT: s_lshr_b32 s38, s10, 16 +; VI-NEXT: s_lshr_b32 s39, s10, 8 +; VI-NEXT: s_lshr_b32 s48, s13, 24 +; VI-NEXT: s_lshr_b32 s49, s13, 16 +; VI-NEXT: s_lshr_b32 s50, s13, 8 +; VI-NEXT: s_lshr_b32 s51, s12, 16 +; VI-NEXT: s_lshr_b32 s52, s12, 8 +; VI-NEXT: s_lshr_b32 s53, s15, 24 +; VI-NEXT: s_lshr_b32 s54, s15, 16 +; VI-NEXT: s_lshr_b32 s55, s15, 8 +; VI-NEXT: s_lshr_b32 s64, s14, 16 +; VI-NEXT: s_lshr_b32 s65, s14, 8 +; VI-NEXT: s_lshr_b32 s66, s17, 24 +; VI-NEXT: s_lshr_b32 s67, s17, 16 +; VI-NEXT: s_lshr_b32 s68, s17, 8 +; VI-NEXT: s_lshr_b32 s69, s16, 16 +; VI-NEXT: s_lshr_b32 s70, s16, 8 +; VI-NEXT: s_lshr_b32 s71, s19, 24 +; VI-NEXT: s_lshr_b32 s80, s19, 16 +; VI-NEXT: s_lshr_b32 s81, s19, 8 +; VI-NEXT: s_lshr_b32 s82, s18, 16 +; VI-NEXT: s_lshr_b32 s83, s18, 8 +; VI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; VI-NEXT: s_mov_b32 s20, s19 +; VI-NEXT: s_mov_b32 s22, s17 +; VI-NEXT: s_mov_b32 s24, s15 +; VI-NEXT: s_mov_b32 s26, s13 +; VI-NEXT: s_mov_b32 s28, s11 +; VI-NEXT: s_mov_b32 s40, s9 +; VI-NEXT: s_mov_b32 s46, s7 +; VI-NEXT: s_mov_b32 s56, s5 +; VI-NEXT: s_cbranch_execnz .LBB109_3 ; VI-NEXT: .LBB109_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s6, s17, 16 -; VI-NEXT: v_mov_b32_e32 v15, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s6, v15 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s6, v15 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: s_lshl_b32 s20, s19, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v2, s20, v1 +; VI-NEXT: v_readfirstlane_b32 s20, v2 +; VI-NEXT: s_bfe_u32 s21, s20, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s20 +; VI-NEXT: s_add_i32 s22, s21, 0x7fff +; VI-NEXT: s_or_b32 s23, s20, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s6, s16, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s6, v15 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s6, s19, 16 -; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s6, v15 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s6, s18, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: s_lshl_b32 s6, s21, 16 -; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s6, v15 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: s_lshl_b32 s6, s20, 16 -; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16 -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; VI-NEXT: v_add_f32_e32 v7, s6, v15 -; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: s_lshl_b32 s6, s23, 16 -; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16 -; VI-NEXT: v_add_f32_e32 v7, s6, v15 -; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s6, v15 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: s_lshl_b32 s6, s22, 16 -; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16 -; VI-NEXT: v_add_f32_e32 v7, s6, v15 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; VI-NEXT: v_add_f32_e32 v9, s6, v15 -; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: s_lshl_b32 s6, s25, 16 -; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16 -; VI-NEXT: v_add_f32_e32 v9, s6, v15 -; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_add_f32_e32 v10, s6, v15 -; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: s_lshl_b32 s6, s24, 16 -; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16 -; VI-NEXT: v_add_f32_e32 v9, s6, v15 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc -; VI-NEXT: v_add_f32_e32 v11, s6, v15 -; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: s_lshl_b32 s6, s27, 16 -; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16 -; VI-NEXT: v_add_f32_e32 v11, s6, v15 -; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_add_f32_e32 v12, s6, v15 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: s_lshl_b32 s6, s26, 16 -; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16 -; VI-NEXT: v_add_f32_e32 v11, s6, v15 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s6, v15 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: s_lshl_b32 s6, s29, 16 -; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16 -; VI-NEXT: v_add_f32_e32 v13, s6, v15 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc -; VI-NEXT: v_add_f32_e32 v14, s6, v15 -; VI-NEXT: v_bfe_u32 v16, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v14 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v14 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: s_lshl_b32 s6, s28, 16 -; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; VI-NEXT: v_add_f32_e32 v13, s6, v15 -; VI-NEXT: v_bfe_u32 v16, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v13 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s6, v15 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s6, s5, 16 -; VI-NEXT: v_alignbit_b32 v13, v16, v13, 16 -; VI-NEXT: v_add_f32_e32 v16, s6, v15 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: s_and_b64 s[20:21], vcc, exec +; VI-NEXT: s_cselect_b32 s20, s23, s22 +; VI-NEXT: s_and_b32 s19, s19, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s19, v1 +; VI-NEXT: v_readfirstlane_b32 s19, v2 +; VI-NEXT: s_bfe_u32 s21, s19, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s19 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s19, 22 +; VI-NEXT: s_and_b64 s[22:23], vcc, exec +; VI-NEXT: s_cselect_b32 s19, s19, s21 +; VI-NEXT: s_lshr_b32 s21, s19, 16 +; VI-NEXT: s_lshl_b32 s19, s18, 16 +; VI-NEXT: v_add_f32_e32 v2, s19, v1 +; VI-NEXT: s_lshr_b64 s[20:21], s[20:21], 16 +; VI-NEXT: v_readfirstlane_b32 s19, v2 +; VI-NEXT: s_bfe_u32 s21, s19, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s19 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s19, 22 +; VI-NEXT: s_and_b64 s[22:23], vcc, exec +; VI-NEXT: s_cselect_b32 s22, s19, s21 +; VI-NEXT: s_and_b32 s18, s18, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s18, v1 +; VI-NEXT: v_readfirstlane_b32 s18, v2 +; VI-NEXT: s_bfe_u32 s19, s18, 0x10010 +; VI-NEXT: s_add_i32 s19, s19, s18 +; VI-NEXT: s_add_i32 s21, s19, 0x7fff +; VI-NEXT: s_or_b32 s23, s18, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[18:19], vcc, exec +; VI-NEXT: s_cselect_b32 s18, s23, s21 +; VI-NEXT: s_lshl_b32 s21, s17, 16 +; VI-NEXT: v_add_f32_e32 v2, s21, v1 +; VI-NEXT: s_lshr_b32 s23, s18, 16 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: s_lshr_b64 s[18:19], s[22:23], 16 +; VI-NEXT: s_bfe_u32 s22, s21, 0x10010 +; VI-NEXT: s_add_i32 s22, s22, s21 +; VI-NEXT: s_add_i32 s24, s22, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s21, 22 +; VI-NEXT: s_and_b64 s[22:23], vcc, exec +; VI-NEXT: s_cselect_b32 s22, s21, s24 +; VI-NEXT: s_and_b32 s17, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s17, v1 +; VI-NEXT: v_readfirstlane_b32 s17, v2 +; VI-NEXT: s_bfe_u32 s21, s17, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s17 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s17, 22 +; VI-NEXT: s_and_b64 s[24:25], vcc, exec +; VI-NEXT: s_cselect_b32 s17, s17, s21 +; VI-NEXT: s_lshr_b32 s23, s17, 16 +; VI-NEXT: s_lshl_b32 s17, s16, 16 +; VI-NEXT: v_add_f32_e32 v2, s17, v1 +; VI-NEXT: v_readfirstlane_b32 s17, v2 +; VI-NEXT: s_bfe_u32 s21, s17, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s17 +; VI-NEXT: s_lshr_b64 s[22:23], s[22:23], 16 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s17, 22 +; VI-NEXT: s_and_b64 s[24:25], vcc, exec +; VI-NEXT: s_cselect_b32 s24, s17, s21 +; VI-NEXT: s_and_b32 s16, s16, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s16, v1 +; VI-NEXT: v_readfirstlane_b32 s16, v2 +; VI-NEXT: s_bfe_u32 s17, s16, 0x10010 +; VI-NEXT: s_add_i32 s17, s17, s16 +; VI-NEXT: s_add_i32 s21, s17, 0x7fff +; VI-NEXT: s_or_b32 s23, s16, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[16:17], vcc, exec +; VI-NEXT: s_cselect_b32 s16, s23, s21 +; VI-NEXT: s_lshl_b32 s21, s15, 16 +; VI-NEXT: v_add_f32_e32 v2, s21, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: s_bfe_u32 s23, s21, 0x10010 +; VI-NEXT: s_lshr_b32 s25, s16, 16 +; VI-NEXT: s_add_i32 s23, s23, s21 +; VI-NEXT: s_lshr_b64 s[16:17], s[24:25], 16 +; VI-NEXT: s_addk_i32 s23, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s21, 22 +; VI-NEXT: s_and_b64 s[24:25], vcc, exec +; VI-NEXT: s_cselect_b32 s24, s21, s23 +; VI-NEXT: s_and_b32 s15, s15, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s15, v1 +; VI-NEXT: v_readfirstlane_b32 s15, v2 +; VI-NEXT: s_bfe_u32 s21, s15, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s15 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s15, 22 +; VI-NEXT: s_and_b64 s[26:27], vcc, exec +; VI-NEXT: s_cselect_b32 s15, s15, s21 +; VI-NEXT: s_lshr_b32 s25, s15, 16 +; VI-NEXT: s_lshl_b32 s15, s14, 16 +; VI-NEXT: v_add_f32_e32 v2, s15, v1 +; VI-NEXT: v_readfirstlane_b32 s15, v2 +; VI-NEXT: s_bfe_u32 s21, s15, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s15 +; VI-NEXT: s_lshr_b64 s[24:25], s[24:25], 16 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s15, 22 +; VI-NEXT: s_and_b64 s[26:27], vcc, exec +; VI-NEXT: s_cselect_b32 s26, s15, s21 +; VI-NEXT: s_and_b32 s14, s14, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s14, v1 +; VI-NEXT: v_readfirstlane_b32 s14, v2 +; VI-NEXT: s_bfe_u32 s15, s14, 0x10010 +; VI-NEXT: s_add_i32 s15, s15, s14 +; VI-NEXT: s_add_i32 s21, s15, 0x7fff +; VI-NEXT: s_or_b32 s23, s14, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s23, s21 +; VI-NEXT: s_lshl_b32 s21, s13, 16 +; VI-NEXT: v_add_f32_e32 v2, s21, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: s_bfe_u32 s23, s21, 0x10010 +; VI-NEXT: s_lshr_b32 s27, s14, 16 +; VI-NEXT: s_add_i32 s23, s23, s21 +; VI-NEXT: s_lshr_b64 s[14:15], s[26:27], 16 +; VI-NEXT: s_addk_i32 s23, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s21, 22 +; VI-NEXT: s_and_b64 s[26:27], vcc, exec +; VI-NEXT: s_cselect_b32 s26, s21, s23 +; VI-NEXT: s_and_b32 s13, s13, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s13, v1 +; VI-NEXT: v_readfirstlane_b32 s13, v2 +; VI-NEXT: s_bfe_u32 s21, s13, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s13 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s13, 22 +; VI-NEXT: s_and_b64 s[28:29], vcc, exec +; VI-NEXT: s_cselect_b32 s13, s13, s21 +; VI-NEXT: s_lshr_b32 s27, s13, 16 +; VI-NEXT: s_lshl_b32 s13, s12, 16 +; VI-NEXT: v_add_f32_e32 v2, s13, v1 +; VI-NEXT: v_readfirstlane_b32 s13, v2 +; VI-NEXT: s_bfe_u32 s21, s13, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s13 +; VI-NEXT: s_lshr_b64 s[26:27], s[26:27], 16 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s13, 22 +; VI-NEXT: s_and_b64 s[28:29], vcc, exec +; VI-NEXT: s_cselect_b32 s28, s13, s21 +; VI-NEXT: s_and_b32 s12, s12, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s12, v1 +; VI-NEXT: v_readfirstlane_b32 s12, v2 +; VI-NEXT: s_bfe_u32 s13, s12, 0x10010 +; VI-NEXT: s_add_i32 s13, s13, s12 +; VI-NEXT: s_add_i32 s21, s13, 0x7fff +; VI-NEXT: s_or_b32 s23, s12, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s23, s21 +; VI-NEXT: s_lshl_b32 s21, s11, 16 +; VI-NEXT: v_add_f32_e32 v2, s21, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: s_bfe_u32 s23, s21, 0x10010 +; VI-NEXT: s_lshr_b32 s29, s12, 16 +; VI-NEXT: s_add_i32 s23, s23, s21 +; VI-NEXT: s_lshr_b64 s[12:13], s[28:29], 16 +; VI-NEXT: s_addk_i32 s23, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s21, 22 +; VI-NEXT: s_and_b64 s[28:29], vcc, exec +; VI-NEXT: s_cselect_b32 s28, s21, s23 +; VI-NEXT: s_and_b32 s11, s11, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s11, v1 +; VI-NEXT: v_readfirstlane_b32 s11, v2 +; VI-NEXT: s_bfe_u32 s21, s11, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s11 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s11, 22 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s11, s11, s21 +; VI-NEXT: s_lshr_b32 s29, s11, 16 +; VI-NEXT: s_lshl_b32 s11, s10, 16 +; VI-NEXT: v_add_f32_e32 v2, s11, v1 +; VI-NEXT: v_readfirstlane_b32 s11, v2 +; VI-NEXT: s_bfe_u32 s21, s11, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s11 +; VI-NEXT: s_lshr_b64 s[28:29], s[28:29], 16 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s11, 22 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s40, s11, s21 +; VI-NEXT: s_and_b32 s10, s10, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s10, v1 +; VI-NEXT: v_readfirstlane_b32 s10, v2 +; VI-NEXT: s_bfe_u32 s11, s10, 0x10010 +; VI-NEXT: s_add_i32 s11, s11, s10 +; VI-NEXT: s_add_i32 s21, s11, 0x7fff +; VI-NEXT: s_or_b32 s23, s10, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s23, s21 +; VI-NEXT: s_lshl_b32 s21, s9, 16 +; VI-NEXT: v_add_f32_e32 v2, s21, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: s_bfe_u32 s23, s21, 0x10010 +; VI-NEXT: s_lshr_b32 s41, s10, 16 +; VI-NEXT: s_add_i32 s23, s23, s21 +; VI-NEXT: s_lshr_b64 s[10:11], s[40:41], 16 +; VI-NEXT: s_addk_i32 s23, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s21, 22 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s40, s21, s23 +; VI-NEXT: s_and_b32 s9, s9, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s9, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v2 +; VI-NEXT: s_bfe_u32 s21, s9, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s9 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s9, 22 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s9, s9, s21 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshl_b32 s9, s8, 16 +; VI-NEXT: v_add_f32_e32 v2, s9, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v2 +; VI-NEXT: s_bfe_u32 s21, s9, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s9 +; VI-NEXT: s_lshr_b64 s[40:41], s[40:41], 16 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s9, 22 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s9, s21 +; VI-NEXT: s_and_b32 s8, s8, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s8, v1 +; VI-NEXT: v_readfirstlane_b32 s8, v2 +; VI-NEXT: s_bfe_u32 s9, s8, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s8 +; VI-NEXT: s_add_i32 s21, s9, 0x7fff +; VI-NEXT: s_or_b32 s23, s8, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s23, s21 +; VI-NEXT: s_lshl_b32 s21, s7, 16 +; VI-NEXT: v_add_f32_e32 v2, s21, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: s_bfe_u32 s23, s21, 0x10010 +; VI-NEXT: s_lshr_b32 s43, s8, 16 +; VI-NEXT: s_add_i32 s23, s23, s21 +; VI-NEXT: s_lshr_b64 s[8:9], s[42:43], 16 +; VI-NEXT: s_addk_i32 s23, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s21, 22 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s21, s23 +; VI-NEXT: s_and_b32 s7, s7, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s21, s7, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s7 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[44:45], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s21 +; VI-NEXT: s_lshr_b32 s43, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s6, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s21, s7, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s7 +; VI-NEXT: s_lshr_b64 s[46:47], s[42:43], 16 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s7, s21 +; VI-NEXT: s_and_b32 s6, s6, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_add_i32 s21, s7, 0x7fff +; VI-NEXT: s_or_b32 s23, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s23, s21 +; VI-NEXT: s_lshl_b32 s21, s5, 16 +; VI-NEXT: v_add_f32_e32 v2, s21, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: s_bfe_u32 s23, s21, 0x10010 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_add_i32 s23, s23, s21 +; VI-NEXT: s_lshr_b64 s[6:7], s[42:43], 16 +; VI-NEXT: s_addk_i32 s23, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s21, 22 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s21, s23 ; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s5, v15 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: s_bfe_u32 s21, s5, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s5 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: s_and_b64 s[44:45], vcc, exec +; VI-NEXT: s_cselect_b32 s5, s5, s21 +; VI-NEXT: s_lshr_b32 s43, s5, 16 ; VI-NEXT: s_lshl_b32 s5, s4, 16 -; VI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; VI-NEXT: v_add_f32_e32 v17, s5, v15 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: s_bfe_u32 s21, s5, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s5 +; VI-NEXT: s_lshr_b64 s[56:57], s[42:43], 16 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s5, s21 ; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v15, s4, v15 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v17, 16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 -; VI-NEXT: s_branch .LBB109_5 -; VI-NEXT: .LBB109_3: +; VI-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_bfe_u32 s5, s4, 0x10010 +; VI-NEXT: s_add_i32 s5, s5, s4 +; VI-NEXT: s_add_i32 s21, s5, 0x7fff +; VI-NEXT: s_or_b32 s23, s4, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cselect_b32 s4, s23, s21 +; VI-NEXT: s_lshr_b32 s43, s4, 16 +; VI-NEXT: s_lshr_b64 s[4:5], s[42:43], 16 +; VI-NEXT: s_mov_b32 s19, s20 +; VI-NEXT: s_mov_b32 s17, s22 +; VI-NEXT: s_mov_b32 s15, s24 +; VI-NEXT: s_mov_b32 s13, s26 +; VI-NEXT: s_mov_b32 s11, s28 +; VI-NEXT: s_mov_b32 s9, s40 +; VI-NEXT: s_mov_b32 s7, s46 +; VI-NEXT: s_mov_b32 s5, s56 +; VI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[6:7], 24 +; VI-NEXT: s_lshr_b32 s21, s56, 24 +; VI-NEXT: s_lshr_b32 s23, s56, 16 +; VI-NEXT: s_lshr_b32 s25, s56, 8 +; VI-NEXT: s_lshr_b32 s27, s4, 16 +; VI-NEXT: s_lshr_b32 s29, s4, 8 +; VI-NEXT: s_lshr_b32 s41, s46, 24 +; VI-NEXT: s_lshr_b32 s47, s46, 16 +; VI-NEXT: s_lshr_b32 s57, s46, 8 +; VI-NEXT: s_lshr_b32 s88, s6, 16 +; VI-NEXT: s_lshr_b32 s89, s6, 8 +; VI-NEXT: s_lshr_b32 s90, s40, 24 +; VI-NEXT: s_lshr_b32 s91, s40, 16 +; VI-NEXT: s_lshr_b32 s30, s40, 8 +; VI-NEXT: s_lshr_b32 s31, s8, 16 +; VI-NEXT: s_lshr_b32 s34, s8, 8 +; VI-NEXT: s_lshr_b32 s35, s28, 24 +; VI-NEXT: s_lshr_b32 s36, s28, 16 +; VI-NEXT: s_lshr_b32 s37, s28, 8 +; VI-NEXT: s_lshr_b32 s38, s10, 16 +; VI-NEXT: s_lshr_b32 s39, s10, 8 +; VI-NEXT: s_lshr_b32 s48, s26, 24 +; VI-NEXT: s_lshr_b32 s49, s26, 16 +; VI-NEXT: s_lshr_b32 s50, s26, 8 +; VI-NEXT: s_lshr_b32 s51, s12, 16 +; VI-NEXT: s_lshr_b32 s52, s12, 8 +; VI-NEXT: s_lshr_b32 s53, s24, 24 +; VI-NEXT: s_lshr_b32 s54, s24, 16 +; VI-NEXT: s_lshr_b32 s55, s24, 8 +; VI-NEXT: s_lshr_b32 s64, s14, 16 +; VI-NEXT: s_lshr_b32 s65, s14, 8 +; VI-NEXT: s_lshr_b32 s66, s22, 24 +; VI-NEXT: s_lshr_b32 s67, s22, 16 +; VI-NEXT: s_lshr_b32 s68, s22, 8 +; VI-NEXT: s_lshr_b32 s69, s16, 16 +; VI-NEXT: s_lshr_b32 s70, s16, 8 +; VI-NEXT: s_lshr_b32 s71, s20, 24 +; VI-NEXT: s_lshr_b32 s80, s20, 16 +; VI-NEXT: s_lshr_b32 s81, s20, 8 +; VI-NEXT: s_lshr_b32 s82, s18, 16 +; VI-NEXT: s_lshr_b32 s83, s18, 8 +; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; VI-NEXT: .LBB109_3: ; %end +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s7, s83, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_lshl_b32 s7, s76, 8 +; VI-NEXT: s_and_b32 s9, s82, 0xff +; VI-NEXT: s_or_b32 s7, s9, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s7, s81, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s80, 0xff +; VI-NEXT: s_lshl_b32 s9, s71, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_lshl_b32 s7, s70, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s69, 0xff +; VI-NEXT: s_lshl_b32 s9, s74, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s68, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s67, 0xff +; VI-NEXT: s_lshl_b32 s9, s66, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s14, 0xff +; VI-NEXT: s_lshl_b32 s7, s65, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s64, 0xff +; VI-NEXT: s_lshl_b32 s9, s72, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s55, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s54, 0xff +; VI-NEXT: s_lshl_b32 s9, s53, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s12, 0xff +; VI-NEXT: s_lshl_b32 s7, s52, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s51, 0xff +; VI-NEXT: s_lshl_b32 s9, s62, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s7, s50, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s49, 0xff +; VI-NEXT: s_lshl_b32 s9, s48, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s10, 0xff +; VI-NEXT: s_lshl_b32 s7, s39, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s38, 0xff +; VI-NEXT: s_lshl_b32 s9, s60, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s7, s37, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s36, 0xff +; VI-NEXT: s_lshl_b32 s9, s35, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s8, 0xff +; VI-NEXT: s_lshl_b32 s7, s34, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s31, 0xff +; VI-NEXT: s_lshl_b32 s8, s58, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s40, 0xff +; VI-NEXT: s_lshl_b32 s7, s30, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s91, 0xff +; VI-NEXT: s_lshl_b32 s8, s90, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s6, 0xff +; VI-NEXT: s_lshl_b32 s6, s89, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s88, 0xff +; VI-NEXT: s_lshl_b32 s7, s44, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s46, 0xff +; VI-NEXT: s_lshl_b32 s6, s57, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s47, 0xff +; VI-NEXT: s_lshl_b32 s7, s41, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s27, 0xff +; VI-NEXT: s_lshl_b32 s6, s42, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s56, 0xff +; VI-NEXT: s_lshl_b32 s5, s25, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s23, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s83, v18, 27 +; VI-NEXT: v_readlane_b32 s82, v18, 26 +; VI-NEXT: v_readlane_b32 s81, v18, 25 +; VI-NEXT: v_readlane_b32 s80, v18, 24 +; VI-NEXT: v_readlane_b32 s71, v18, 23 +; VI-NEXT: v_readlane_b32 s70, v18, 22 +; VI-NEXT: v_readlane_b32 s69, v18, 21 +; VI-NEXT: v_readlane_b32 s68, v18, 20 +; VI-NEXT: v_readlane_b32 s67, v18, 19 +; VI-NEXT: v_readlane_b32 s66, v18, 18 +; VI-NEXT: v_readlane_b32 s65, v18, 17 +; VI-NEXT: v_readlane_b32 s64, v18, 16 +; VI-NEXT: v_readlane_b32 s55, v18, 15 +; VI-NEXT: v_readlane_b32 s54, v18, 14 +; VI-NEXT: v_readlane_b32 s53, v18, 13 +; VI-NEXT: v_readlane_b32 s52, v18, 12 +; VI-NEXT: v_readlane_b32 s51, v18, 11 +; VI-NEXT: v_readlane_b32 s50, v18, 10 +; VI-NEXT: v_readlane_b32 s49, v18, 9 +; VI-NEXT: v_readlane_b32 s48, v18, 8 +; VI-NEXT: v_readlane_b32 s39, v18, 7 +; VI-NEXT: v_readlane_b32 s38, v18, 6 +; VI-NEXT: v_readlane_b32 s37, v18, 5 +; VI-NEXT: v_readlane_b32 s36, v18, 4 +; VI-NEXT: v_readlane_b32 s35, v18, 3 +; VI-NEXT: v_readlane_b32 s34, v18, 2 +; VI-NEXT: v_readlane_b32 s31, v18, 1 +; VI-NEXT: v_readlane_b32 s30, v18, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB109_4: +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr68 ; VI-NEXT: ; implicit-def: $sgpr67 -; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr65 ; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: ; implicit-def: $sgpr55 ; VI-NEXT: ; implicit-def: $sgpr54 -; VI-NEXT: ; implicit-def: $sgpr52 -; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr50 ; VI-NEXT: ; implicit-def: $sgpr49 -; VI-NEXT: ; implicit-def: $sgpr39 -; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr39 ; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr37 ; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr12 ; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 ; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr40 ; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr91 -; VI-NEXT: ; implicit-def: $sgpr89 -; VI-NEXT: ; implicit-def: $sgpr14 ; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 ; VI-NEXT: ; implicit-def: $sgpr88 -; VI-NEXT: ; implicit-def: $sgpr79 -; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr77 -; VI-NEXT: ; implicit-def: $sgpr75 -; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr73 -; VI-NEXT: ; implicit-def: $sgpr63 -; VI-NEXT: ; implicit-def: $sgpr42 -; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr61 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr21 ; VI-NEXT: s_branch .LBB109_2 -; VI-NEXT: .LBB109_4: -; VI-NEXT: v_mov_b32_e32 v19, s44 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v19, s42 -; VI-NEXT: v_mov_b32_e32 v1, s16 -; VI-NEXT: v_mov_b32_e32 v2, s17 -; VI-NEXT: v_mov_b32_e32 v3, s18 -; VI-NEXT: v_mov_b32_e32 v4, s19 -; VI-NEXT: v_mov_b32_e32 v5, s20 -; VI-NEXT: v_mov_b32_e32 v6, s21 -; VI-NEXT: v_mov_b32_e32 v7, s22 -; VI-NEXT: v_mov_b32_e32 v8, s23 -; VI-NEXT: v_mov_b32_e32 v9, s24 -; VI-NEXT: v_mov_b32_e32 v10, s25 -; VI-NEXT: v_mov_b32_e32 v11, s26 -; VI-NEXT: v_mov_b32_e32 v12, s27 -; VI-NEXT: v_mov_b32_e32 v13, s28 -; VI-NEXT: v_mov_b32_e32 v14, s29 -; VI-NEXT: v_mov_b32_e32 v15, s4 -; VI-NEXT: v_mov_b32_e32 v16, s5 -; VI-NEXT: v_mov_b32_e32 v18, s67 -; VI-NEXT: v_mov_b32_e32 v62, s65 -; VI-NEXT: v_mov_b32_e32 v17, s66 -; VI-NEXT: v_mov_b32_e32 v60, s64 -; VI-NEXT: v_mov_b32_e32 v61, s55 -; VI-NEXT: v_mov_b32_e32 v58, s54 -; VI-NEXT: v_mov_b32_e32 v59, s52 -; VI-NEXT: v_mov_b32_e32 v57, s53 -; VI-NEXT: v_mov_b32_e32 v47, s51 -; VI-NEXT: v_mov_b32_e32 v56, s50 -; VI-NEXT: v_mov_b32_e32 v46, s49 -; VI-NEXT: v_mov_b32_e32 v45, s39 -; VI-NEXT: v_mov_b32_e32 v44, s48 -; VI-NEXT: v_mov_b32_e32 v42, s38 -; VI-NEXT: v_mov_b32_e32 v43, s37 -; VI-NEXT: v_mov_b32_e32 v41, s36 -; VI-NEXT: v_mov_b32_e32 v40, s34 -; VI-NEXT: v_mov_b32_e32 v55, s35 -; VI-NEXT: v_mov_b32_e32 v53, s31 -; VI-NEXT: v_mov_b32_e32 v54, s30 -; VI-NEXT: v_mov_b32_e32 v52, s91 -; VI-NEXT: v_mov_b32_e32 v51, s89 -; VI-NEXT: v_mov_b32_e32 v50, s90 -; VI-NEXT: v_mov_b32_e32 v48, s88 -; VI-NEXT: v_mov_b32_e32 v49, s79 -; VI-NEXT: v_mov_b32_e32 v39, s78 -; VI-NEXT: v_mov_b32_e32 v38, s76 -; VI-NEXT: v_mov_b32_e32 v37, s77 -; VI-NEXT: v_mov_b32_e32 v35, s75 -; VI-NEXT: v_mov_b32_e32 v36, s74 -; VI-NEXT: v_mov_b32_e32 v34, s73 -; VI-NEXT: v_mov_b32_e32 v33, s63 -; VI-NEXT: v_mov_b32_e32 v32, s72 -; VI-NEXT: v_mov_b32_e32 v30, s62 -; VI-NEXT: v_mov_b32_e32 v31, s61 -; VI-NEXT: v_mov_b32_e32 v29, s60 -; VI-NEXT: v_mov_b32_e32 v28, s58 -; VI-NEXT: v_mov_b32_e32 v27, s59 -; VI-NEXT: v_mov_b32_e32 v25, s57 -; VI-NEXT: v_mov_b32_e32 v26, s56 -; VI-NEXT: v_mov_b32_e32 v21, s12 -; VI-NEXT: v_mov_b32_e32 v22, s10 -; VI-NEXT: v_mov_b32_e32 v23, s8 -; VI-NEXT: v_mov_b32_e32 v24, s6 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v19, s40 -; VI-NEXT: v_mov_b32_e32 v20, s14 -; VI-NEXT: .LBB109_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v24 -; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v62, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v58 -; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 -; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 -; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v21 -; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 -; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v36 -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_readlane_b32 s67, v63, 19 -; VI-NEXT: v_readlane_b32 s66, v63, 18 -; VI-NEXT: v_readlane_b32 s65, v63, 17 -; VI-NEXT: v_readlane_b32 s64, v63, 16 -; VI-NEXT: v_readlane_b32 s55, v63, 15 -; VI-NEXT: v_readlane_b32 s54, v63, 14 -; VI-NEXT: v_readlane_b32 s53, v63, 13 -; VI-NEXT: v_readlane_b32 s52, v63, 12 -; VI-NEXT: v_readlane_b32 s51, v63, 11 -; VI-NEXT: v_readlane_b32 s50, v63, 10 -; VI-NEXT: v_readlane_b32 s49, v63, 9 -; VI-NEXT: v_readlane_b32 s48, v63, 8 -; VI-NEXT: v_readlane_b32 s39, v63, 7 -; VI-NEXT: v_readlane_b32 s38, v63, 6 -; VI-NEXT: v_readlane_b32 s37, v63, 5 -; VI-NEXT: v_readlane_b32 s36, v63, 4 -; VI-NEXT: v_readlane_b32 s35, v63, 3 -; VI-NEXT: v_readlane_b32 s34, v63, 2 -; VI-NEXT: v_readlane_b32 s31, v63, 1 -; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 -; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v32bf16_to_v64i8_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v4, s30, 0 -; GFX9-NEXT: v_writelane_b32 v4, s31, 1 -; GFX9-NEXT: v_writelane_b32 v4, s34, 2 -; GFX9-NEXT: v_writelane_b32 v4, s35, 3 -; GFX9-NEXT: v_writelane_b32 v4, s36, 4 -; GFX9-NEXT: v_writelane_b32 v4, s37, 5 -; GFX9-NEXT: v_writelane_b32 v4, s38, 6 -; GFX9-NEXT: v_writelane_b32 v4, s39, 7 -; GFX9-NEXT: v_writelane_b32 v4, s48, 8 -; GFX9-NEXT: v_writelane_b32 v4, s49, 9 -; GFX9-NEXT: v_writelane_b32 v4, s50, 10 -; GFX9-NEXT: v_writelane_b32 v4, s51, 11 -; GFX9-NEXT: v_writelane_b32 v4, s52, 12 -; GFX9-NEXT: v_writelane_b32 v4, s53, 13 +; GFX9-NEXT: v_writelane_b32 v18, s30, 0 +; GFX9-NEXT: v_writelane_b32 v18, s31, 1 +; GFX9-NEXT: v_writelane_b32 v18, s34, 2 +; GFX9-NEXT: v_writelane_b32 v18, s35, 3 +; GFX9-NEXT: v_writelane_b32 v18, s36, 4 +; GFX9-NEXT: v_writelane_b32 v18, s37, 5 +; GFX9-NEXT: v_writelane_b32 v18, s38, 6 +; GFX9-NEXT: v_writelane_b32 v18, s39, 7 +; GFX9-NEXT: v_writelane_b32 v18, s48, 8 +; GFX9-NEXT: v_writelane_b32 v18, s49, 9 +; GFX9-NEXT: v_writelane_b32 v18, s50, 10 +; GFX9-NEXT: v_writelane_b32 v18, s51, 11 +; GFX9-NEXT: v_writelane_b32 v18, s52, 12 +; GFX9-NEXT: v_writelane_b32 v18, s53, 13 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_mov_b32_e32 v8, s20 +; GFX9-NEXT: v_mov_b32_e32 v9, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s22 +; GFX9-NEXT: v_mov_b32_e32 v11, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v14, s26 +; GFX9-NEXT: v_mov_b32_e32 v15, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_writelane_b32 v4, s54, 14 +; GFX9-NEXT: v_writelane_b32 v18, s54, 14 +; GFX9-NEXT: v_readfirstlane_b32 s18, v4 +; GFX9-NEXT: v_readfirstlane_b32 s19, v5 +; GFX9-NEXT: v_readfirstlane_b32 s16, v6 +; GFX9-NEXT: v_readfirstlane_b32 s17, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s12, v10 +; GFX9-NEXT: v_readfirstlane_b32 s13, v11 +; GFX9-NEXT: v_readfirstlane_b32 s10, v12 +; GFX9-NEXT: v_readfirstlane_b32 s11, v13 +; GFX9-NEXT: v_readfirstlane_b32 s8, v14 +; GFX9-NEXT: v_readfirstlane_b32 s9, v15 +; GFX9-NEXT: v_readfirstlane_b32 s6, v16 +; GFX9-NEXT: v_readfirstlane_b32 s7, v17 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: v_writelane_b32 v4, s55, 15 +; GFX9-NEXT: v_writelane_b32 v18, s55, 15 ; GFX9-NEXT: s_cbranch_scc0 .LBB109_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b32 s92, s5, 24 @@ -88868,425 +89877,425 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_lshr_b32 s93, s5, 8 ; GFX9-NEXT: s_lshr_b32 s94, s4, 16 ; GFX9-NEXT: s_lshr_b32 s95, s4, 8 -; GFX9-NEXT: s_lshr_b32 s30, s29, 24 -; GFX9-NEXT: s_lshr_b32 s90, s29, 16 -; GFX9-NEXT: s_lshr_b32 s75, s29, 8 -; GFX9-NEXT: s_lshr_b32 s31, s28, 16 -; GFX9-NEXT: s_lshr_b32 s74, s28, 8 -; GFX9-NEXT: s_lshr_b32 s34, s27, 24 -; GFX9-NEXT: s_lshr_b32 s89, s27, 16 -; GFX9-NEXT: s_lshr_b32 s73, s27, 8 -; GFX9-NEXT: s_lshr_b32 s35, s26, 16 -; GFX9-NEXT: s_lshr_b32 s72, s26, 8 -; GFX9-NEXT: s_lshr_b32 s36, s25, 24 -; GFX9-NEXT: s_lshr_b32 s88, s25, 16 -; GFX9-NEXT: s_lshr_b32 s63, s25, 8 -; GFX9-NEXT: s_lshr_b32 s37, s24, 16 -; GFX9-NEXT: s_lshr_b32 s62, s24, 8 -; GFX9-NEXT: s_lshr_b32 s38, s23, 24 -; GFX9-NEXT: s_lshr_b32 s79, s23, 16 -; GFX9-NEXT: s_lshr_b32 s61, s23, 8 -; GFX9-NEXT: s_lshr_b32 s39, s22, 16 -; GFX9-NEXT: s_lshr_b32 s60, s22, 8 -; GFX9-NEXT: s_lshr_b32 s48, s21, 24 -; GFX9-NEXT: s_lshr_b32 s78, s21, 16 -; GFX9-NEXT: s_lshr_b32 s59, s21, 8 -; GFX9-NEXT: s_lshr_b32 s49, s20, 16 -; GFX9-NEXT: s_lshr_b32 s58, s20, 8 -; GFX9-NEXT: s_lshr_b32 s50, s19, 24 -; GFX9-NEXT: s_lshr_b32 s77, s19, 16 -; GFX9-NEXT: s_lshr_b32 s57, s19, 8 -; GFX9-NEXT: s_lshr_b32 s51, s18, 16 -; GFX9-NEXT: s_lshr_b32 s56, s18, 8 -; GFX9-NEXT: s_lshr_b32 s52, s17, 24 -; GFX9-NEXT: s_lshr_b32 s76, s17, 16 -; GFX9-NEXT: s_lshr_b32 s53, s17, 8 -; GFX9-NEXT: s_lshr_b32 s54, s16, 16 -; GFX9-NEXT: s_lshr_b32 s55, s16, 8 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s30, s7, 24 +; GFX9-NEXT: s_lshr_b32 s90, s7, 16 +; GFX9-NEXT: s_lshr_b32 s75, s7, 8 +; GFX9-NEXT: s_lshr_b32 s31, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s6, 8 +; GFX9-NEXT: s_lshr_b32 s34, s9, 24 +; GFX9-NEXT: s_lshr_b32 s89, s9, 16 +; GFX9-NEXT: s_lshr_b32 s73, s9, 8 +; GFX9-NEXT: s_lshr_b32 s35, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s8, 8 +; GFX9-NEXT: s_lshr_b32 s36, s11, 24 +; GFX9-NEXT: s_lshr_b32 s88, s11, 16 +; GFX9-NEXT: s_lshr_b32 s63, s11, 8 +; GFX9-NEXT: s_lshr_b32 s37, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s10, 8 +; GFX9-NEXT: s_lshr_b32 s38, s13, 24 +; GFX9-NEXT: s_lshr_b32 s79, s13, 16 +; GFX9-NEXT: s_lshr_b32 s61, s13, 8 +; GFX9-NEXT: s_lshr_b32 s39, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s12, 8 +; GFX9-NEXT: s_lshr_b32 s48, s15, 24 +; GFX9-NEXT: s_lshr_b32 s78, s15, 16 +; GFX9-NEXT: s_lshr_b32 s59, s15, 8 +; GFX9-NEXT: s_lshr_b32 s49, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s14, 8 +; GFX9-NEXT: s_lshr_b32 s50, s17, 24 +; GFX9-NEXT: s_lshr_b32 s77, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s17, 8 +; GFX9-NEXT: s_lshr_b32 s51, s16, 16 +; GFX9-NEXT: s_lshr_b32 s56, s16, 8 +; GFX9-NEXT: s_lshr_b32 s52, s19, 24 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s53, s19, 8 +; GFX9-NEXT: s_lshr_b32 s54, s18, 16 +; GFX9-NEXT: s_lshr_b32 s55, s18, 8 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB109_3 ; GFX9-NEXT: .LBB109_2: ; %cmp.true -; GFX9-NEXT: s_and_b32 s6, s17, 0xffff0000 +; GFX9-NEXT: s_and_b32 s20, s19, 0xffff0000 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s76, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s17, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s19, s19, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s19, v1 +; GFX9-NEXT: v_readfirstlane_b32 s19, v2 +; GFX9-NEXT: s_lshr_b32 s76, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s19, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s19 +; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s17, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s16, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_bitset1_b32 s19, 22 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s19, s19, s22 +; GFX9-NEXT: s_and_b32 s20, s18, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_lshr_b32 s19, s19, 16 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s16, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s18, s18, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s18, v1 +; GFX9-NEXT: v_readfirstlane_b32 s18, v2 +; GFX9-NEXT: s_lshr_b32 s22, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s18, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s18 +; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s16, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s19, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s46, s16, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_bitset1_b32 s18, 22 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s18, s18, s23 +; GFX9-NEXT: s_and_b32 s20, s17, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_lshr_b32 s18, s18, 16 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_pack_ll_b32_b16 s46, s18, s22 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s19, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s17, v1 +; GFX9-NEXT: v_readfirstlane_b32 s17, v2 +; GFX9-NEXT: s_lshr_b32 s77, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s17, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s17 +; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s19, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s18, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_bitset1_b32 s17, 22 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s17, s17, s22 +; GFX9-NEXT: s_and_b32 s20, s16, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_lshr_b32 s17, s17, 16 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s18, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s16, s16, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s16, v1 +; GFX9-NEXT: v_readfirstlane_b32 s16, v2 +; GFX9-NEXT: s_lshr_b32 s22, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s16, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s16 +; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s18, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s21, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s56, s18, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_bitset1_b32 s16, 22 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s16, s16, s23 +; GFX9-NEXT: s_and_b32 s20, s15, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_lshr_b32 s16, s16, 16 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_pack_ll_b32_b16 s56, s16, s22 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s78, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s21, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s15, s15, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s15, v1 +; GFX9-NEXT: v_readfirstlane_b32 s15, v2 +; GFX9-NEXT: s_lshr_b32 s78, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s15, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s15 +; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s21, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s20, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_bitset1_b32 s15, 22 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s15, s15, s22 +; GFX9-NEXT: s_and_b32 s20, s14, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_lshr_b32 s15, s15, 16 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s20, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s14, s14, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s14, v1 +; GFX9-NEXT: v_readfirstlane_b32 s14, v2 +; GFX9-NEXT: s_lshr_b32 s22, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s14, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s14 +; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s20, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s23, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s58, s20, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_bitset1_b32 s14, 22 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s14, s14, s23 +; GFX9-NEXT: s_and_b32 s20, s13, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_lshr_b32 s14, s14, 16 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_pack_ll_b32_b16 s58, s14, s22 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s79, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s23, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s13, s13, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s13, v1 +; GFX9-NEXT: v_readfirstlane_b32 s13, v2 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s13, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s13 +; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s23, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s22, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_bitset1_b32 s13, 22 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s13, s13, s22 +; GFX9-NEXT: s_and_b32 s20, s12, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_lshr_b32 s13, s13, 16 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s22, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s12, s12, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s12, v1 +; GFX9-NEXT: v_readfirstlane_b32 s12, v2 +; GFX9-NEXT: s_lshr_b32 s22, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s12, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s12 +; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s22, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s25, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s60, s22, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_bitset1_b32 s12, 22 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s12, s12, s23 +; GFX9-NEXT: s_and_b32 s20, s11, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_lshr_b32 s12, s12, 16 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_pack_ll_b32_b16 s60, s12, s22 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s88, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s25, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s11, s11, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s11, v1 +; GFX9-NEXT: v_readfirstlane_b32 s11, v2 +; GFX9-NEXT: s_lshr_b32 s88, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s11, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s11 +; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s25, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s24, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_bitset1_b32 s11, 22 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s11, s11, s22 +; GFX9-NEXT: s_and_b32 s20, s10, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_lshr_b32 s11, s11, 16 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s24, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s10, v1 +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: s_lshr_b32 s22, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s10, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s10 +; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s24, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s27, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s62, s24, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_bitset1_b32 s10, 22 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s10, s10, s23 +; GFX9-NEXT: s_and_b32 s20, s9, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_lshr_b32 s10, s10, 16 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_pack_ll_b32_b16 s62, s10, s22 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s89, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s27, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s9, v1 +; GFX9-NEXT: v_readfirstlane_b32 s9, v2 +; GFX9-NEXT: s_lshr_b32 s89, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s9, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s9 +; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s27, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s26, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_bitset1_b32 s9, 22 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s9, s9, s22 +; GFX9-NEXT: s_and_b32 s20, s8, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_lshr_b32 s9, s9, 16 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s26, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_lshr_b32 s22, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s8 +; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s26, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s29, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s72, s26, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_bitset1_b32 s8, 22 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s8, s23 +; GFX9-NEXT: s_and_b32 s20, s7, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_lshr_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_pack_ll_b32_b16 s72, s8, s22 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s90, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s29, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s7, v2 +; GFX9-NEXT: s_lshr_b32 s90, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s7, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s7 +; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s29, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s28, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_bitset1_b32 s7, 22 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s7, s7, s22 +; GFX9-NEXT: s_and_b32 s20, s6, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_lshr_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s28, 16 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 ; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_lshr_b32 s22, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s6 +; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s28, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s5, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s74, s28, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_bitset1_b32 s6, 22 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s6, s23 +; GFX9-NEXT: s_and_b32 s20, s5, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_pack_ll_b32_b16 s74, s6, s22 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: s_lshr_b32 s91, s6, 16 -; GFX9-NEXT: s_bfe_u32 s6, s5, 0x10010 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_add_i32 s8, s6, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s5, 22 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s5, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s5 +; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s5, s5, s8 -; GFX9-NEXT: s_and_b32 s6, s4, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_bitset1_b32 s5, 22 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s5, s5, s22 +; GFX9-NEXT: s_and_b32 s20, s4, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 ; GFX9-NEXT: s_lshr_b32 s5, s5, 16 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 ; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x10010 -; GFX9-NEXT: s_add_i32 s6, s6, s4 -; GFX9-NEXT: s_add_i32 s9, s6, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s4, 22 +; GFX9-NEXT: s_lshr_b32 s22, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s4, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s4 +; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s4, s4, s9 +; GFX9-NEXT: s_bitset1_b32 s4, 22 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s4, s4, s23 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s47, s17, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s57, s19, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s59, s21, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s61, s23, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s63, s25, s88 -; GFX9-NEXT: s_pack_ll_b32_b16 s73, s27, s89 -; GFX9-NEXT: s_pack_ll_b32_b16 s75, s29, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s47, s19, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s57, s17, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s59, s15, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s61, s13, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s63, s11, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s73, s9, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s75, s7, s90 ; GFX9-NEXT: s_pack_ll_b32_b16 s31, s5, s91 -; GFX9-NEXT: s_pack_ll_b32_b16 s30, s4, s8 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[30:31], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[74:75], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[72:73], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[62:63], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[60:61], 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s30, s4, s22 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[30:31], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[74:75], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[72:73], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[62:63], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[60:61], 24 ; GFX9-NEXT: s_lshr_b64 s[40:41], s[58:59], 24 ; GFX9-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 ; GFX9-NEXT: s_lshr_b64 s[44:45], s[46:47], 24 @@ -89323,165 +90332,165 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_lshr_b32 s54, s46, 16 ; GFX9-NEXT: s_lshr_b32 s55, s46, 8 ; GFX9-NEXT: .LBB109_3: ; %end -; GFX9-NEXT: s_and_b32 s7, s16, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s55, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s54, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s44, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s53, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s76, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s52, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s18, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s55, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s21 +; GFX9-NEXT: s_and_b32 s21, s54, 0xff +; GFX9-NEXT: s_lshl_b32 s23, s44, 8 +; GFX9-NEXT: s_or_b32 s21, s21, s23 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s21, s21, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s21 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s18, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s53, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: s_and_b32 s19, s76, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s52, 8 +; GFX9-NEXT: s_or_b32 s19, s19, s21 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s19, s19, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s19 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s56, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s51, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s42, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s56, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s18 +; GFX9-NEXT: s_and_b32 s18, s51, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s42, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s18, s18, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s19, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s57, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s77, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s50, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s57, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s50, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s58, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s49, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s40, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s58, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s16 +; GFX9-NEXT: s_and_b32 s16, s49, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s40, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s16, s16, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s16 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s21, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s59, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s78, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s48, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s14, s15, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s59, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s15, s78, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s48, 8 +; GFX9-NEXT: s_or_b32 s15, s15, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s15, s15, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s15 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s60, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s39, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s14, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s60, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s14 +; GFX9-NEXT: s_and_b32 s14, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s28, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s14, s14, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s14 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s23, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s61, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s79, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s38, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s12, s13, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s61, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s13, s79, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s38, 8 +; GFX9-NEXT: s_or_b32 s13, s13, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s13, s13, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s13 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s62, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s37, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s12, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s62, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s12 +; GFX9-NEXT: s_and_b32 s12, s37, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s26, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s12, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s12 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s25, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s63, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s88, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s36, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s63, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s11, s88, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s36, 8 +; GFX9-NEXT: s_or_b32 s11, s11, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s11, s11, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s72, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s35, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s10 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s72, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s10 +; GFX9-NEXT: s_and_b32 s10, s35, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s24, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: s_or_b32 s8, s8, s10 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s27, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s8, s9, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s73, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_and_b32 s9, s89, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s34, 8 ; GFX9-NEXT: s_or_b32 s9, s9, s10 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff ; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s74, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s31, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_or_b32 s8, s9, s8 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s29, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s75, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s8, s90, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s30, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s74, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: s_and_b32 s8, s31, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s22, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s7, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s75, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s90, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s30, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s95, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s7 -; GFX9-NEXT: s_and_b32 s7, s94, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_lshl_b32 s6, s95, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s94, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s20, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s6 @@ -89499,24 +90508,24 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_readlane_b32 s55, v4, 15 -; GFX9-NEXT: v_readlane_b32 s54, v4, 14 -; GFX9-NEXT: v_readlane_b32 s53, v4, 13 -; GFX9-NEXT: v_readlane_b32 s52, v4, 12 -; GFX9-NEXT: v_readlane_b32 s51, v4, 11 -; GFX9-NEXT: v_readlane_b32 s50, v4, 10 -; GFX9-NEXT: v_readlane_b32 s49, v4, 9 -; GFX9-NEXT: v_readlane_b32 s48, v4, 8 -; GFX9-NEXT: v_readlane_b32 s39, v4, 7 -; GFX9-NEXT: v_readlane_b32 s38, v4, 6 -; GFX9-NEXT: v_readlane_b32 s37, v4, 5 -; GFX9-NEXT: v_readlane_b32 s36, v4, 4 -; GFX9-NEXT: v_readlane_b32 s35, v4, 3 -; GFX9-NEXT: v_readlane_b32 s34, v4, 2 -; GFX9-NEXT: v_readlane_b32 s31, v4, 1 -; GFX9-NEXT: v_readlane_b32 s30, v4, 0 +; GFX9-NEXT: v_readlane_b32 s55, v18, 15 +; GFX9-NEXT: v_readlane_b32 s54, v18, 14 +; GFX9-NEXT: v_readlane_b32 s53, v18, 13 +; GFX9-NEXT: v_readlane_b32 s52, v18, 12 +; GFX9-NEXT: v_readlane_b32 s51, v18, 11 +; GFX9-NEXT: v_readlane_b32 s50, v18, 10 +; GFX9-NEXT: v_readlane_b32 s49, v18, 9 +; GFX9-NEXT: v_readlane_b32 s48, v18, 8 +; GFX9-NEXT: v_readlane_b32 s39, v18, 7 +; GFX9-NEXT: v_readlane_b32 s38, v18, 6 +; GFX9-NEXT: v_readlane_b32 s37, v18, 5 +; GFX9-NEXT: v_readlane_b32 s36, v18, 4 +; GFX9-NEXT: v_readlane_b32 s35, v18, 3 +; GFX9-NEXT: v_readlane_b32 s34, v18, 2 +; GFX9-NEXT: v_readlane_b32 s31, v18, 1 +; GFX9-NEXT: v_readlane_b32 s30, v18, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -89541,31 +90550,31 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: ; implicit-def: $sgpr48 ; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr39 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr61 ; GFX9-NEXT: ; implicit-def: $sgpr79 ; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr37 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr63 ; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr72 ; GFX9-NEXT: ; implicit-def: $sgpr35 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: ; implicit-def: $sgpr73 ; GFX9-NEXT: ; implicit-def: $sgpr89 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr74 ; GFX9-NEXT: ; implicit-def: $sgpr31 -; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr22 ; GFX9-NEXT: ; implicit-def: $sgpr75 ; GFX9-NEXT: ; implicit-def: $sgpr90 ; GFX9-NEXT: ; implicit-def: $sgpr30 ; GFX9-NEXT: ; implicit-def: $sgpr95 ; GFX9-NEXT: ; implicit-def: $sgpr94 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr20 ; GFX9-NEXT: ; implicit-def: $sgpr93 ; GFX9-NEXT: ; implicit-def: $sgpr91 ; GFX9-NEXT: ; implicit-def: $sgpr92 |
