diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll | 888 |
1 files changed, 545 insertions, 343 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 632b03c..73b57a5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -1482,46 +1482,87 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v2bf16_to_i32_scalar: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 -; GFX11-NEXT: ; %bb.1: ; %Flow -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 -; GFX11-NEXT: .LBB15_2: ; %cmp.true -; GFX11-NEXT: s_lshl_b32 s1, s0, 16 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB15_3: -; GFX11-NEXT: s_branch .LBB15_2 -; GFX11-NEXT: .LBB15_4: -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_3: +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_3: +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2279,17 +2320,13 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true @@ -2301,13 +2338,9 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3728,46 +3761,87 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v2bf16_to_f32_scalar: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 -; GFX11-NEXT: ; %bb.1: ; %Flow -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 -; GFX11-NEXT: .LBB35_2: ; %cmp.true -; GFX11-NEXT: s_lshl_b32 s1, s0, 16 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB35_3: -; GFX11-NEXT: s_branch .LBB35_2 -; GFX11-NEXT: .LBB35_4: -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB35_3: +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_3: +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4530,17 +4604,13 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB42_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: .LBB42_4: ; %cmp.true @@ -4552,13 +4622,9 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5440,27 +5506,24 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h ; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -5608,44 +5671,81 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v2bf16_to_v2i16_scalar: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB51_3 -; GFX11-NEXT: ; %bb.1: ; %Flow -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_vccnz .LBB51_4 -; GFX11-NEXT: .LBB51_2: ; %cmp.true -; GFX11-NEXT: s_lshl_b32 s1, s0, 16 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo -; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB51_3: -; GFX11-NEXT: s_branch .LBB51_2 -; GFX11-NEXT: .LBB51_4: -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v2i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_4 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_add_nc_u32 v3, v3, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_3: +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v2i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_4 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_3: +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6487,17 +6587,13 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true @@ -6509,13 +6605,9 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7247,46 +7339,87 @@ define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v2bf16_to_v2f16_scalar: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB63_3 -; GFX11-NEXT: ; %bb.1: ; %Flow -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_vccnz .LBB63_4 -; GFX11-NEXT: .LBB63_2: ; %cmp.true -; GFX11-NEXT: s_lshl_b32 s1, s0, 16 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB63_3: -; GFX11-NEXT: s_branch .LBB63_2 -; GFX11-NEXT: .LBB63_4: -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v2f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB63_4 +; GFX11-TRUE16-NEXT: .LBB63_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB63_3: +; GFX11-TRUE16-NEXT: s_branch .LBB63_2 +; GFX11-TRUE16-NEXT: .LBB63_4: +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v2f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB63_4 +; GFX11-FAKE16-NEXT: .LBB63_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB63_3: +; GFX11-FAKE16-NEXT: s_branch .LBB63_2 +; GFX11-FAKE16-NEXT: .LBB63_4: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -8138,17 +8271,13 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true @@ -8160,13 +8289,9 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8685,46 +8810,87 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v2bf16_to_v1i32_scalar: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB73_3 -; GFX11-NEXT: ; %bb.1: ; %Flow -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_vccnz .LBB73_4 -; GFX11-NEXT: .LBB73_2: ; %cmp.true -; GFX11-NEXT: s_lshl_b32 s1, s0, 16 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB73_3: -; GFX11-NEXT: s_branch .LBB73_2 -; GFX11-NEXT: .LBB73_4: -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v1i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX11-TRUE16-NEXT: .LBB73_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB73_3: +; GFX11-TRUE16-NEXT: s_branch .LBB73_2 +; GFX11-TRUE16-NEXT: .LBB73_4: +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v1i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX11-FAKE16-NEXT: .LBB73_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB73_3: +; GFX11-FAKE16-NEXT: s_branch .LBB73_2 +; GFX11-FAKE16-NEXT: .LBB73_4: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -9290,57 +9456,109 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v2bf16_to_v4i8_scalar: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB77_3 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s2, s0, 24 -; GFX11-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_vccnz .LBB77_4 -; GFX11-NEXT: .LBB77_2: ; %cmp.true -; GFX11-NEXT: s_lshl_b32 s1, s0, 16 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB77_3: -; GFX11-NEXT: ; implicit-def: $sgpr3 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr2 -; GFX11-NEXT: s_branch .LBB77_2 -; GFX11-NEXT: .LBB77_4: -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2 -; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v4i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB77_4 +; GFX11-TRUE16-NEXT: .LBB77_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB77_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-TRUE16-NEXT: s_branch .LBB77_2 +; GFX11-TRUE16-NEXT: .LBB77_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v4i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB77_4 +; GFX11-FAKE16-NEXT: .LBB77_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB77_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: s_branch .LBB77_2 +; GFX11-FAKE16-NEXT: .LBB77_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -9502,17 +9720,13 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB78_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2 ; GFX11-TRUE16-NEXT: .LBB78_4: ; %cmp.true @@ -9524,13 +9738,9 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10212,17 +10422,13 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB82_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-TRUE16-NEXT: .LBB82_4: ; %cmp.true @@ -10234,13 +10440,9 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; |