diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll | 532 |
1 files changed, 228 insertions, 304 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll index 2f08931..8784352 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll @@ -1872,63 +1872,48 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b, ; GFX11-SDAG-TRUE16-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0: ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v1.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.l, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v0.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v0.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v3.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v1 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v3.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0: @@ -1985,74 +1970,59 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b, ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.h ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.l, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v1.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.h ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v0.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v0.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v3.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h +; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v1 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v3.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.h +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0: @@ -2505,114 +2475,88 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> % ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v1.h, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.l, s1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v7 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v6, v8 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.l, v3.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v6.l, v1.l, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v4.l, s1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v0.h, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.h, v1.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v4.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v4, v5 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v5.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v4.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v1.l, s1 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s2 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v2.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v4.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.h, s2 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v2.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.h, v5.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, s0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v1, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.l, v2.l, s0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v5.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v1 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v5.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v5.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v6 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v7 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v1.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v1.h, s2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0: @@ -2710,123 +2654,103 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> % ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l +; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v1.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.h, v1.h, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.l, s1 -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v7 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v6, v8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.h +; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.l, v3.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v6.l, v1.l, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v4.h, s1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.h +; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v4, v5 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v5.h ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v4.l, s1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7 -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v0.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v4.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v4.h, s1 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.h, s2 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v1.l, s1 -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s2 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v2.l +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v6 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v7 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v1.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v2.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.h, v5.h, s1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h +; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v1, v1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.l, v2.l, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v5.h, s1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v5.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v5.h, s1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v1.h, s2 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0: |