diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/bf16.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/bf16.ll | 841 |
1 files changed, 538 insertions, 303 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 2bdf994..cd6d741 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -9082,17 +9082,19 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX11TRUE16-LABEL: v_fadd_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13318,9 +13320,10 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX11TRUE16-LABEL: v_fadd_bf16_fpimm_0: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11TRUE16-NEXT: v_add_f32_e32 v0, 1.0, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -13413,9 +13416,10 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX11TRUE16-LABEL: v_fadd_bf16_fpimm_1: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX11TRUE16-NEXT: v_add_f32_e32 v0, 0x42280000, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -13515,17 +13519,19 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX11TRUE16-LABEL: v_fsub_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -14275,17 +14281,19 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX11TRUE16-LABEL: v_fmul_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v1, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18568,32 +18576,34 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX11TRUE16-LABEL: v_fdiv_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11TRUE16-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11TRUE16-NEXT: v_div_scale_f32 v1, null, v0, v0, v2 +; GFX11TRUE16-NEXT: v_div_scale_f32 v5, vcc_lo, v2, v0, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_rcp_f32_e32 v3, v1 ; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11TRUE16-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX11TRUE16-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11TRUE16-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX11TRUE16-NEXT: v_fma_f32 v6, -v1, v4, v5 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_fma_f32 v6, -v2, v4, v5 ; GFX11TRUE16-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX11TRUE16-NEXT: v_fma_f32 v1, -v1, v4, v5 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_fma_f32 v2, -v2, v4, v5 -; GFX11TRUE16-NEXT: v_div_fmas_f32 v2, v2, v3, v4 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11TRUE16-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX11TRUE16-NEXT: v_div_fixup_f32 v0, v1, v0, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19018,17 +19028,19 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX11TRUE16-LABEL: v_minnum_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_min_f32_e32 v0, v1, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -23270,17 +23282,19 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX11TRUE16-LABEL: v_maxnum_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v1, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -27591,11 +27605,12 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_sqrt_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 -; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v1 +; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, 0x4f800000, v1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_sqrt_f32_e32 v1, v0 ; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff @@ -27730,9 +27745,10 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX11TRUE16-LABEL: v_ldexp_bf16_i32: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v2, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -27836,17 +27852,18 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX11TRUE16-LABEL: v_frexp_bf16_i16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_frexp_mant_f32_e32 v0, v1 +; GFX11TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -28019,11 +28036,12 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_log_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo -; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo +; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_log_f32_e32 v0, v0 ; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff @@ -28177,13 +28195,14 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_log2_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo +; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_log_f32_e32 v0, v0 ; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff ; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -28367,11 +28386,12 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_log10_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo -; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo +; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_log_f32_e32 v0, v0 ; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff @@ -28580,25 +28600,26 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_exp_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 -; GFX11TRUE16-NEXT: v_rndne_f32_e32 v2, v1 -; GFX11TRUE16-NEXT: v_fma_f32 v3, 0x3fb8aa3b, v0, -v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v1 +; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v1 +; GFX11TRUE16-NEXT: v_fma_f32 v2, 0x3fb8aa3b, v1, -v0 +; GFX11TRUE16-NEXT: v_rndne_f32_e32 v3, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX11TRUE16-NEXT: v_fmamk_f32 v3, v0, 0x32a5705f, v3 -; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX11TRUE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX11TRUE16-NEXT: v_fmamk_f32 v2, v1, 0x32a5705f, v2 +; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GFX11TRUE16-NEXT: v_exp_f32_e32 v0, v0 ; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v0, vcc_lo ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -28744,13 +28765,14 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_exp2_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, vcc_lo +; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo -; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_exp_f32_e32 v0, v0 ; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff ; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 @@ -28937,25 +28959,26 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_exp10_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 -; GFX11TRUE16-NEXT: v_rndne_f32_e32 v2, v1 -; GFX11TRUE16-NEXT: v_fma_f32 v3, 0x40549a78, v0, -v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, 0x40549a78, v1 +; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v1 +; GFX11TRUE16-NEXT: v_fma_f32 v2, 0x40549a78, v1, -v0 +; GFX11TRUE16-NEXT: v_rndne_f32_e32 v3, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX11TRUE16-NEXT: v_fmamk_f32 v3, v0, 0x33979a37, v3 -; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX11TRUE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX11TRUE16-NEXT: v_fmamk_f32 v2, v1, 0x33979a37, v2 +; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GFX11TRUE16-NEXT: v_exp_f32_e32 v0, v0 ; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v0, vcc_lo ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -29066,9 +29089,10 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_ceil_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_ceil_f32_e32 v0, v0 +; GFX11TRUE16-NEXT: v_ceil_f32_e32 v0, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -29163,9 +29187,10 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_trunc_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_trunc_f32_e32 v0, v0 +; GFX11TRUE16-NEXT: v_trunc_f32_e32 v0, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -29260,9 +29285,10 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_rint_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v0 +; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -29357,9 +29383,10 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_nearbyint_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v0 +; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -29483,16 +29510,17 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_round_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_trunc_f32_e32 v1, v0 -; GFX11TRUE16-NEXT: v_sub_f32_e32 v2, v0, v1 +; GFX11TRUE16-NEXT: v_trunc_f32_e32 v0, v1 +; GFX11TRUE16-NEXT: v_sub_f32_e32 v2, v1, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5 ; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0 -; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v2, v1 +; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -29594,9 +29622,10 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_roundeven_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v0 +; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -29691,9 +29720,10 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_floor_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_floor_f32_e32 v0, v0 +; GFX11TRUE16-NEXT: v_floor_f32_e32 v0, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -29786,9 +29816,10 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_canonicalize_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -29916,15 +29947,27 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_oeq_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_oeq_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_oeq_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp oeq bfloat %a, %b ret i1 %op } @@ -29979,15 +30022,27 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_ogt_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_ogt_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_ogt_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp ogt bfloat %a, %b ret i1 %op } @@ -30042,15 +30097,27 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_oge_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_oge_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_ge_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_oge_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp oge bfloat %a, %b ret i1 %op } @@ -30105,15 +30172,27 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_olt_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_olt_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_olt_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp olt bfloat %a, %b ret i1 %op } @@ -30168,15 +30247,27 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_ole_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_ole_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_ole_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp ole bfloat %a, %b ret i1 %op } @@ -30231,15 +30322,27 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_one_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_one_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_lg_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_one_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp one bfloat %a, %b ret i1 %op } @@ -30294,15 +30397,27 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_uno_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_uno_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_uno_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp uno bfloat %a, %b ret i1 %op } @@ -30357,15 +30472,27 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_ueq_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_ueq_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_ueq_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp ueq bfloat %a, %b ret i1 %op } @@ -30420,15 +30547,27 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_ugt_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_ugt_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_nle_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_ugt_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp ugt bfloat %a, %b ret i1 %op } @@ -30483,15 +30622,27 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_uge_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_uge_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_uge_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp uge bfloat %a, %b ret i1 %op } @@ -30546,15 +30697,27 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_ult_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_ult_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_nge_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_ult_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp ult bfloat %a, %b ret i1 %op } @@ -30609,15 +30772,27 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_ule_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_ule_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_ule_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp ule bfloat %a, %b ret i1 %op } @@ -30672,15 +30847,27 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_une_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_une_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_neq_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_une_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp une bfloat %a, %b ret i1 %op } @@ -30763,13 +30950,22 @@ define i16 @v_fptosi_bf16_to_i16(bfloat %x) { ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fptosi_bf16_to_i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fptosi_bf16_to_i16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v1 +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fptosi_bf16_to_i16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fptosi bfloat %x to i16 ret i16 %op } @@ -31144,13 +31340,22 @@ define i32 @v_fptosi_bf16_to_i32(bfloat %x) { ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fptosi_bf16_to_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fptosi_bf16_to_i32: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v1 +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fptosi_bf16_to_i32: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fptosi bfloat %x to i32 ret i32 %op } @@ -31494,27 +31699,50 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) { ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fptosi_bf16_to_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v0, v0 -; GFX11-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0| -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_floor_f32_e32 v1, v1 -; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0| -; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v2 -; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_xor_b32_e32 v0, v0, v3 -; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fptosi_bf16_to_i64: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_trunc_f32_e32 v0, v1 +; GFX11TRUE16-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0| +; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_floor_f32_e32 v1, v1 +; GFX11TRUE16-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0| +; GFX11TRUE16-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v2 +; GFX11TRUE16-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_xor_b32_e32 v0, v0, v3 +; GFX11TRUE16-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fptosi_bf16_to_i64: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_trunc_f32_e32 v0, v0 +; GFX11FAKE16-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0| +; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_floor_f32_e32 v1, v1 +; GFX11FAKE16-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0| +; GFX11FAKE16-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v2 +; GFX11FAKE16-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_xor_b32_e32 v0, v0, v3 +; GFX11FAKE16-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fptosi bfloat %x to i64 ret i64 %op } @@ -42575,18 +42803,21 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX11TRUE16-LABEL: v_fma_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_bfe_u32 v0, v3, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GFX11TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v2 -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v2, 0x7fff ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -43457,26 +43688,30 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX11TRUE16-LABEL: v_fmuladd_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v1, v3 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v3 :: v_dual_lshlrev_b32 v1, 16, v2 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; |