diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll | 216 |
1 files changed, 114 insertions, 102 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index bfed1f4..48c9b87 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -2257,8 +2257,8 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -2273,18 +2273,19 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true @@ -2294,16 +2295,17 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4504,8 +4506,8 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -4520,18 +4522,19 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB42_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: .LBB42_4: ; %cmp.true @@ -4541,16 +4544,17 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6463,8 +6467,8 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -6479,18 +6483,19 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true @@ -6500,16 +6505,17 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8110,8 +8116,8 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -8126,18 +8132,19 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true @@ -8147,16 +8154,17 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9471,8 +9479,8 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -9487,18 +9495,19 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB78_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2 ; GFX11-TRUE16-NEXT: .LBB78_4: ; %cmp.true @@ -9508,16 +9517,17 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10183,8 +10193,8 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -10199,18 +10209,19 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB82_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-TRUE16-NEXT: .LBB82_4: ; %cmp.true @@ -10220,16 +10231,17 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; |