diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll | 1421 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fmax_legacy.ll | 226 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll | 281 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll | 1421 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fmin_legacy.ll | 302 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fneg-combines.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 29 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-dead-intrinsics.ll | 9 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll | 46 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll | 341 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/v_mac.ll | 9 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/v_mac_f16.ll | 14 |
12 files changed, 2397 insertions, 1704 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll index ed48999..bd28f72 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -1,734 +1,759 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-NNAN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=VI-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-NNAN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=VI %s -; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s +; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-TRUE16 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-FAKE16 %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-TRUE16 %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-FAKE16 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 { -; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmax_legacy_ugt_f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmax_legacy_ugt_f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmax_legacy_ugt_f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmax_legacy_ugt_f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v1 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.l, v1.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-TRUE16-LABEL: test_fmax_legacy_ugt_f16: -; GFX11-NNAN-TRUE16: ; %bb.0: -; GFX11-NNAN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l -; GFX11-NNAN-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-FAKE16-LABEL: test_fmax_legacy_ugt_f16: -; GFX11-NNAN-FAKE16: ; %bb.0: -; GFX11-NNAN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX11-NNAN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmax_legacy_ugt_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt half %a, %b %val = select i1 %cmp, half %a, half %b ret half %val } +define half @test_fmax_legacy_ugt_f16_fast(half %a, half %b) #0 { +; GFX9-LABEL: test_fmax_legacy_ugt_f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_f16_fast: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_f16_fast: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ugt half %a, %b + %val = select nnan nsz i1 %cmp, half %a, half %b + ret half %val +} + define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_v2f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v2, v0, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v2f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmax_legacy_ugt_v2f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v1, v3, v1 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmax_legacy_ugt_v2f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v2 -; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v3 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v2f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v1.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.l, v1.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v2f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v2f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmax_legacy_ugt_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; SI-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v1.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <2 x half> %a, %b %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b ret <2 x half> %val } +define <2 x half> @test_fmax_legacy_ugt_v2f16_fast(<2 x half> %a, <2 x half> %b) #0 { +; GFX9-LABEL: test_fmax_legacy_ugt_v2f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v2f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v2f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v2 +; SI-NEXT: v_max_f32_e32 v1, v1, v3 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmax_legacy_ugt_v2f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ugt <2 x half> %a, %b + %val = select nnan nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_v3f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX9-NNAN-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2 -; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v3, v0 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v1, v4, v1 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v2, v5, v2 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v3 -; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v4 -; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v5 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v3f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v2.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.l, v2.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v1.l, v3.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s1 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v3f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX11-NNAN-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmax_legacy_ugt_v3f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v3f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_max_legacy_f32_e32 v0, v3, v0 +; SI-NEXT: v_max_legacy_f32_e32 v1, v4, v1 +; SI-NEXT: v_max_legacy_f32_e32 v2, v5, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_v3f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v2.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_v3f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <3 x half> %a, %b %val = select <3 x i1> %cmp, <3 x half> %a, <3 x half> %b ret <3 x half> %val } +define <3 x half> @test_fmax_legacy_ugt_v3f16_fast(<3 x half> %a, <3 x half> %b) #0 { +; GFX9-LABEL: test_fmax_legacy_ugt_v3f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v3f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v0, v0, v2 +; VI-NEXT: v_max_f16_e32 v1, v1, v3 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v3f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v3 +; SI-NEXT: v_max_f32_e32 v1, v1, v4 +; SI-NEXT: v_max_f32_e32 v2, v2, v5 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmax_legacy_ugt_v3f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ugt <3 x half> %a, %b + %val = select nnan nsz <3 x i1> %cmp, <3 x half> %a, <3 x half> %b + ret <3 x half> %val +} + define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v1, v6, v1, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX9-NNAN-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3 -; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v4, v0 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v1, v5, v1 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v2, v6, v2 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v3, v7, v3 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v4 -; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v5 -; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v6 -; SI-NNAN-NEXT: v_max_f32_e32 v3, v3, v7 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v4f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1.h, v3.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.h, v2.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v0.l, v2.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, v1.l, v3.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s1 -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s2 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v4f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v7, v6 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX11-NNAN-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmax_legacy_ugt_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_max_legacy_f32_e32 v0, v4, v0 +; SI-NEXT: v_max_legacy_f32_e32 v1, v5, v1 +; SI-NEXT: v_max_legacy_f32_e32 v2, v6, v2 +; SI-NEXT: v_max_legacy_f32_e32 v3, v7, v3 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_v4f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1.h, v3.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.h, v2.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_v4f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v7, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <4 x half> %a, %b %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b ret <4 x half> %val } +define <4 x half> @test_fmax_legacy_ugt_v4f16_fast(<4 x half> %a, <4 x half> %b) #0 { +; GFX9-LABEL: test_fmax_legacy_ugt_v4f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v4f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v1, v1, v3 +; VI-NEXT: v_max_f16_e32 v0, v0, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v4f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v4 +; SI-NEXT: v_max_f32_e32 v1, v1, v5 +; SI-NEXT: v_max_f32_e32 v2, v2, v6 +; SI-NEXT: v_max_f32_e32 v3, v3, v7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmax_legacy_ugt_v4f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ugt <4 x half> %a, %b + %val = select nnan nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v9, v8 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v7 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v2, v6 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v5 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v8, v0, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v1, v10, v1, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v2, v12, v2, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v3, v14, v3, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_max_f16 v0, v0, v4 -; GFX9-NNAN-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX9-NNAN-NEXT: v_pk_max_f16 v2, v2, v6 -; GFX9-NNAN-NEXT: v_pk_max_f16 v3, v3, v7 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v9, v8 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v7 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v2, v6 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v5 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; VI-SAFE-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; VI-SAFE-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_max_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_e32 v3, v3, v7 -; VI-NNAN-NEXT: v_max_f16_e32 v2, v2, v6 -; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v5 -; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v4 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11 -; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10 -; VI-NNAN-NEXT: v_or_b32_e32 v2, v2, v9 -; VI-NNAN-NEXT: v_or_b32_e32 v3, v3, v8 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v8, v0 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v1, v9, v1 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v2, v10, v2 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v3, v11, v3 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v4, v12, v4 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v5, v13, v5 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v6, v14, v6 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v7, v15, v7 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v8 -; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v9 -; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v10 -; SI-NNAN-NEXT: v_max_f32_e32 v3, v3, v11 -; SI-NNAN-NEXT: v_max_f32_e32 v4, v4, v12 -; SI-NNAN-NEXT: v_max_f32_e32 v5, v5, v13 -; SI-NNAN-NEXT: v_max_f32_e32 v6, v6, v14 -; SI-NNAN-NEXT: v_max_f32_e32 v7, v7, v15 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v8f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v4.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v1.h, v5.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v2.h, v6.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, v3.h, v7.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s3, v0.l, v4.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s4, v1.l, v5.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s5, v2.l, v6.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s6, v3.l, v7.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.h, v7.h, v3.h, s2 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.h, v2.h, s1 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s3 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s4 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s5 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v8f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v11, v10 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v13, v12 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v15, v14 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v9, v8 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v2, v6 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v4 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v5 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v7 -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v4 -; GFX11-NNAN-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX11-NNAN-NEXT: v_pk_max_f16 v2, v2, v6 -; GFX11-NNAN-NEXT: v_pk_max_f16 v3, v3, v7 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmax_legacy_ugt_v8f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v9, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v3, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v8, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v10, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v12, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v14, v3, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v8f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; VI-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v9, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v3, v7 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v2, v6 +; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v1, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_max_legacy_f32_e32 v0, v8, v0 +; SI-NEXT: v_max_legacy_f32_e32 v1, v9, v1 +; SI-NEXT: v_max_legacy_f32_e32 v2, v10, v2 +; SI-NEXT: v_max_legacy_f32_e32 v3, v11, v3 +; SI-NEXT: v_max_legacy_f32_e32 v4, v12, v4 +; SI-NEXT: v_max_legacy_f32_e32 v5, v13, v5 +; SI-NEXT: v_max_legacy_f32_e32 v6, v14, v6 +; SI-NEXT: v_max_legacy_f32_e32 v7, v15, v7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_v8f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v4.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v1.h, v5.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v2.h, v6.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, v3.h, v7.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s3, v0.l, v4.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s4, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s5, v2.l, v6.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s6, v3.l, v7.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v7.h, v3.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.h, v2.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_v8f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v11, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v13, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v15, v14 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v9, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v2, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v7 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <8 x half> %a, %b %val = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b ret <8 x half> %val } +define <8 x half> @test_fmax_legacy_ugt_v8f16_fast(<8 x half> %a, <8 x half> %b) #0 { +; GFX9-LABEL: test_fmax_legacy_ugt_v8f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v5 +; GFX9-NEXT: v_pk_max_f16 v2, v2, v6 +; GFX9-NEXT: v_pk_max_f16 v3, v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v8f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v3, v3, v7 +; VI-NEXT: v_max_f16_e32 v2, v2, v6 +; VI-NEXT: v_max_f16_e32 v1, v1, v5 +; VI-NEXT: v_max_f16_e32 v0, v0, v4 +; VI-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NEXT: v_or_b32_e32 v1, v1, v10 +; VI-NEXT: v_or_b32_e32 v2, v2, v9 +; VI-NEXT: v_or_b32_e32 v3, v3, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v8f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v8 +; SI-NEXT: v_max_f32_e32 v1, v1, v9 +; SI-NEXT: v_max_f32_e32 v2, v2, v10 +; SI-NEXT: v_max_f32_e32 v3, v3, v11 +; SI-NEXT: v_max_f32_e32 v4, v4, v12 +; SI-NEXT: v_max_f32_e32 v5, v5, v13 +; SI-NEXT: v_max_f32_e32 v6, v6, v14 +; SI-NEXT: v_max_f32_e32 v7, v7, v15 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmax_legacy_ugt_v8f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v5 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v6 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ugt <8 x half> %a, %b + %val = select nnan nsz <8 x i1> %cmp, <8 x half> %a, <8 x half> %b + ret <8 x half> %val +} + attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll index eee2bd1..f3a84e6 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll @@ -1,8 +1,6 @@ -; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN,FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN-NONAN,GCN,FUNC %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI,GCN,FUNC %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN,FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN-NONAN,GCN,FUNC %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,FUNC %s ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope --check-prefixes=EG,FUNC %s @@ -12,12 +10,10 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] - -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; VI: v_cmp_nlt_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_uge_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { @@ -34,18 +30,38 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f32(ptr addrspace(1) %out, ptr a ret void } +; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32_fast: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_uge_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 + + %cmp = fcmp uge float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + ; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32_nnan_src: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN-DAG: v_add_f32_e32 [[ADD_A:v[0-9]+]], 1.0, [[A]] ; GCN-DAG: v_add_f32_e32 [[ADD_B:v[0-9]+]], 2.0, [[B]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]] +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]] -; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[ADD_A]], [[ADD_B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[ADD_B]], [[ADD_A]] +; VI: v_cmp_nlt_f32_e32 vcc, [[ADD_A]], [[ADD_B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[ADD_B]], [[ADD_A]] -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_uge_f32_nnan_src(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { @@ -64,16 +80,40 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f32_nnan_src(ptr addrspace(1) %o ret void } +; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32_nnan_src_fast: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_add_f32_e32 [[ADD_A:v[0-9]+]], 1.0, [[A]] +; GCN-DAG: v_add_f32_e32 [[ADD_B:v[0-9]+]], 2.0, [[B]] + +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]] + +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_uge_f32_nnan_src_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 + %a.nnan = fadd nnan float %a, 1.0 + %b.nnan = fadd nnan float %b, 2.0 + + %cmp = fcmp uge float %a.nnan, %b.nnan + %val = select nnan nsz i1 %cmp, float %a.nnan, float %b.nnan + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + ; FUNC-LABEL: {{^}}test_fmax_legacy_oge_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; VI-SAFE: v_cmp_ge_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] +; VI: v_cmp_ge_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_oge_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -89,17 +129,35 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmax_legacy_ugt_f32: +; FUNC-LABEL: {{^}}test_fmax_legacy_oge_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_oge_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 -; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %cmp = fcmp oge float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} +; FUNC-LABEL: {{^}}test_fmax_legacy_ugt_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; VI: v_cmp_nle_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_ugt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -115,16 +173,35 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f32(ptr addrspace(1) %out, ptr a ret void } +; FUNC-LABEL: {{^}}test_fmax_legacy_ugt_f32_fast: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_ugt_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 + + %cmp = fcmp ugt float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] +; VI: v_cmp_gt_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_ogt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -140,17 +217,35 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32: +; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_ogt_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 -; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %cmp = fcmp ogt float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} +; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + +; VI: v_cmp_gt_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -166,23 +261,39 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(ptr addrspace(1) %out, ptr ret void } +; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32_fast: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr <1 x float>, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr <1 x float>, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile <1 x float>, ptr addrspace(1) %gep.0 + %b = load volatile <1 x float>, ptr addrspace(1) %gep.1 + + %cmp = fcmp ogt <1 x float> %a, %b + %val = select nnan nsz <1 x i1> %cmp, <1 x float> %a, <1 x float> %b + store <1 x float> %val, ptr addrspace(1) %out + ret void +} + ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v3f32: -; SI-SAFE: v_max_legacy_f32_e32 -; SI-SAFE: v_max_legacy_f32_e32 -; SI-SAFE: v_max_legacy_f32_e32 - -; VI-SAFE: v_cmp_gt_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE: v_cmp_gt_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE: v_cmp_gt_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE-NOT: v_cmp -; VI-SAFE-NOT: v_cndmask - -; GCN-NONAN: v_max_f32_e32 -; GCN-NONAN: v_max_f32_e32 -; GCN-NONAN: v_max_f32_e32 +; SI: v_max_legacy_f32_e32 +; SI: v_max_legacy_f32_e32 +; SI: v_max_legacy_f32_e32 + +; VI: v_cmp_gt_f32_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_gt_f32_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_gt_f32_e32 +; VI: v_cndmask_b32_e32 +; VI-NOT: v_cmp +; VI-NOT: v_cndmask ; GCN-NOT: v_max define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { @@ -199,6 +310,27 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(ptr addrspace(1) %out, ptr ret void } +; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v3f32_fast: + +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 + +; GCN-NOT: v_max +define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr <3 x float>, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr <3 x float>, ptr addrspace(1) %gep.0, i32 1 + + %a = load <3 x float>, ptr addrspace(1) %gep.0 + %b = load <3 x float>, ptr addrspace(1) %gep.1 + + %cmp = fcmp ogt <3 x float> %a, %b + %val = select nnan nsz <3 x i1> %cmp, <3 x float> %a, <3 x float> %b + store <3 x float> %val, ptr addrspace(1) %out + ret void +} + ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32_multi_use: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] diff --git a/llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll index 2ac5891..37f077d5 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll @@ -1,16 +1,12 @@ -; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn < %s | FileCheck -enable-var-scope --check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI,GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,VI-NNAN %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN %s ; GCN-LABEL: {{^}}min_fneg_select_regression_0: ; GCN-NOT: v_mul -; SI: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 - -; VI-SAFE: v_cmp_nle_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc +; VI: v_cmp_nle_f32_e32 vcc, 1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ult float %a, 1.0 @@ -18,15 +14,23 @@ define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 { ret float %min.a } +; GCN-LABEL: {{^}}min_fneg_select_regression_0_fast: +; GCN-NOT: v_mul + +define amdgpu_ps float @min_fneg_select_regression_0_fast(float %a, float %b) #0 { + %fneg.a = fsub float -0.0, %a + %cmp.a = fcmp ult float %a, 1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float -1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0: ; GCN-NOT: v_mul ; SI: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 -; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc - -; VI-NNAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0 +; VI: v_cmp_nle_f32_e32 vcc, -1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ult float %a, -1.0 @@ -34,15 +38,24 @@ define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 ret float %min.a } -; GCN-LABEL: {{^}}max_fneg_select_regression_0: +; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0_fast: ; GCN-NOT: v_mul -; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 +; VI: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0 +define amdgpu_ps float @min_fneg_select_regression_posk_0_fast(float %a, float %b) #0 { + %fneg.a = fsub float -0.0, %a + %cmp.a = fcmp ult float %a, -1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0 + ret float %min.a +} + +; GCN-LABEL: {{^}}max_fneg_select_regression_0: +; GCN-NOT: v_mul -; VI-SAFE: v_cmp_nge_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc +; SI: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 -; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0 +; VI: v_cmp_nge_f32_e32 vcc, 1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ugt float %a, 1.0 @@ -50,15 +63,24 @@ define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 { ret float %min.a } -; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0: +; GCN-LABEL: {{^}}max_fneg_select_regression_0_fast: ; GCN-NOT: v_mul -; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 +; GCN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0 +define amdgpu_ps float @max_fneg_select_regression_0_fast(float %a) #0 { + %fneg.a = fsub float -0.0, %a + %cmp.a = fcmp ugt float %a, 1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float -1.0 + ret float %min.a +} + +; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0: +; GCN-NOT: v_mul -; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc +; SI: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 -; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0 +; VI: v_cmp_nge_f32_e32 vcc, -1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ugt float %a, -1.0 @@ -66,13 +88,22 @@ define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 { ret float %min.a } +; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0_fast: +; GCN-NOT: v_mul + +; GCN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0 +define amdgpu_ps float @max_fneg_select_regression_posk_0_fast(float %a) #0 { + %fneg.a = fsub float -0.0, %a + %cmp.a = fcmp ugt float %a, -1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg1: ; SI: v_min_legacy_f32_e64 v0, 1.0, -v0 -; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc - -; VI-NNAN: v_min_f32_e64 v0, -v0, 1.0 +; VI: v_cmp_nge_f32_e32 vcc, -1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, -1.0 @@ -80,13 +111,21 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg1(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg1_fast: + +; VI: v_min_f32_e64 v0, -v0, 1.0 +define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg1_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp ugt float %a, -1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg1: ; SI: v_max_legacy_f32_e64 v0, 1.0, -v0 -; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc - -; VI-NNAN: v_max_f32_e64 v0, -v0, 1.0 +; VI: v_cmp_nle_f32_e32 vcc, -1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, -1.0 @@ -94,13 +133,21 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg1(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg1_fast: + +; VI: v_max_f32_e64 v0, -v0, 1.0 +define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg1_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp ult float %a, -1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg1: ; SI: v_min_legacy_f32_e64 v0, -v0, 1.0 -; VI-SAFE: v_cmp_lt_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc - -; VI-NNAN: v_min_f32_e64 v0, -v0, 1.0 +; VI: v_cmp_lt_f32_e32 vcc, -1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ogt float %a, -1.0 @@ -108,13 +155,21 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg1(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg1_fast: + +; VI: v_min_f32_e64 v0, -v0, 1.0 +define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg1_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp ogt float %a, -1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg1: ; SI: v_max_legacy_f32_e64 v0, -v0, 1.0 -; VI-SAFE: v_cmp_gt_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc - -; VI-NANN: v_max_f32_e64 v0, -v0, 1.0 +; VI: v_cmp_gt_f32_e32 vcc, -1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp olt float %a, -1.0 @@ -122,17 +177,24 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg1(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg1_fast: + +; VI-NANN: v_max_f32_e64 v0, -v0, 1.0 +define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg1_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp olt float %a, -1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg8: ; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 ; SI-NEXT: v_min_legacy_f32_e64 v0, [[K]], -v0 -; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 -; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc - -; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; VI-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] +; VI-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 +; VI-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 +; VI: v_cmp_nge_f32_e32 vcc, [[K0]], v0 +; VI-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, -8.0 @@ -140,17 +202,25 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg8(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg8_fast: + +; VI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NEXT: v_min_f32_e64 v0, -v0, [[K]] +define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg8_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp ugt float %a, -8.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 8.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg8: ; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 ; SI-NEXT: v_max_legacy_f32_e64 v0, [[K]], -v0 -; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 -; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc - -; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; VI-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] +; VI-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 +; VI-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 +; VI: v_cmp_nle_f32_e32 vcc, [[K0]], v0 +; VI-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, -8.0 @@ -158,17 +228,25 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg8(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg8_fast: + +; VI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NEXT: v_max_f32_e64 v0, -v0, [[K]] +define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg8_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp ult float %a, -8.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 8.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg8: ; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 ; SI-NEXT: v_min_legacy_f32_e64 v0, -v0, [[K]] -; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 -; VI-SAFE: v_cmp_lt_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc - -; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; VI-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] +; VI-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 +; VI-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 +; VI: v_cmp_lt_f32_e32 vcc, [[K0]], v0 +; VI-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ogt float %a, -8.0 @@ -176,18 +254,26 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg8(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg8_fast: + +; VI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NEXT: v_min_f32_e64 v0, -v0, [[K]] +define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg8_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp ogt float %a, -8.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 8.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg8: ; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 ; SI-NEXT: v_max_legacy_f32_e64 v0, -v0, [[K]] -; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 -; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc - -; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; VI-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] +; VI-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 +; VI-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 +; VI: v_cmp_gt_f32_e32 vcc, [[K0]], v0 +; VI-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp olt float %a, -8.0 @@ -195,13 +281,22 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg8(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg8_fast: + +; VI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NEXT: v_max_f32_e64 v0, -v0, [[K]] +define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg8_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp olt float %a, -8.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 8.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_neg1_cmp_olt_a_1: ; SI: v_max_legacy_f32_e64 v0, -v0, -1.0 -; VI-SAFE: v_cmp_gt_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc - -; VI-NNAN: v_max_f32_e64 v0, -v0, -1.0 +; VI: v_cmp_gt_f32_e32 vcc, 1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc define amdgpu_ps float @select_fneg_a_or_neg1_cmp_olt_a_1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp olt float %a, 1.0 @@ -209,15 +304,22 @@ define amdgpu_ps float @select_fneg_a_or_neg1_cmp_olt_a_1(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_neg1_cmp_olt_a_1_fast: + +; VI: v_max_f32_e64 v0, -v0, -1.0 +define amdgpu_ps float @select_fneg_a_or_neg1_cmp_olt_a_1_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp olt float %a, 1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float -1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}ult_a_select_fneg_a_b: ; SI: v_cmp_nge_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc -; VI-SAFE: v_cmp_nge_f32_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc - -; VI-NNAN: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +; VI: v_cmp_nge_f32_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc define amdgpu_ps float @ult_a_select_fneg_a_b(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, %b @@ -225,15 +327,23 @@ define amdgpu_ps float @ult_a_select_fneg_a_b(float %a, float %b) #0 { ret float %min.a } +; GCN-LABEL: {{^}}ult_a_select_fneg_a_b_fast: + +; VI: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +define amdgpu_ps float @ult_a_select_fneg_a_b_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp nnan nsz ult float %a, %b + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float %b + ret float %min.a +} + ; GCN-LABEL: {{^}}ugt_a_select_fneg_a_b: ; SI: v_cmp_nle_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc -; VI-SAFE: v_cmp_nle_f32_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc - -; VI-NNAN: v_cmp_gt_f32_e32 vcc, v0, v1 -; VI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +; VI: v_cmp_nle_f32_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc define amdgpu_ps float @ugt_a_select_fneg_a_b(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, %b @@ -241,5 +351,16 @@ define amdgpu_ps float @ugt_a_select_fneg_a_b(float %a, float %b) #0 { ret float %min.a } +; GCN-LABEL: {{^}}ugt_a_select_fneg_a_b_fast: + +; VI: v_cmp_gt_f32_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +define amdgpu_ps float @ugt_a_select_fneg_a_b_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp nnan nsz ugt float %a, %b + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float %b + ret float %min.a +} + attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index 34cb0b1..40c2ec0 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -1,735 +1,760 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-NNAN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=VI-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-NNAN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=VI %s -; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s +; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-TRUE16 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-FAKE16 %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-TRUE16 %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-FAKE16 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 { -; GFX9-SAFE-LABEL: test_fmin_legacy_ule_f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmin_legacy_ule_f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmin_legacy_ule_f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmin_legacy_ule_f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmin_legacy_ule_f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmin_legacy_ule_f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v1 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.l, v1.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-TRUE16-LABEL: test_fmin_legacy_ule_f16: -; GFX11-NNAN-TRUE16: ; %bb.0: -; GFX11-NNAN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l -; GFX11-NNAN-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-FAKE16-LABEL: test_fmin_legacy_ule_f16: -; GFX11-NNAN-FAKE16: ; %bb.0: -; GFX11-NNAN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX11-NNAN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmin_legacy_ule_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule half %a, %b %val = select i1 %cmp, half %a, half %b ret half %val } +define half @test_fmin_legacy_ule_f16_fast(half %a, half %b) #0 { +; GFX9-LABEL: test_fmin_legacy_ule_f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_f16_fast: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_f16_fast: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule half %a, %b + %val = select nnan nsz i1 %cmp, half %a, half %b + ret half %val +} + define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmin_legacy_ule_v2f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v2, v0, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v2f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmin_legacy_ule_v2f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmin_legacy_ule_v2f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmin_legacy_ule_v2f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v1, v3, v1 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmin_legacy_ule_v2f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v2 -; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v3 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v2f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v1.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.l, v1.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v2f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v2f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmin_legacy_ule_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; SI-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v1.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x half> %a, %b %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b ret <2 x half> %val } +define <2 x half> @test_fmin_legacy_ule_v2f16_fast(<2 x half> %a, <2 x half> %b) #0 { +; GFX9-LABEL: test_fmin_legacy_ule_v2f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v2f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v2f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v2 +; SI-NEXT: v_min_f32_e32 v1, v1, v3 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmin_legacy_ule_v2f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x half> %a, %b + %val = select nnan nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmin_legacy_ule_v3f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v3f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX9-NNAN-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmin_legacy_ule_v3f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmin_legacy_ule_v3f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2 -; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmin_legacy_ule_v3f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v3, v0 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v1, v4, v1 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v2, v5, v2 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmin_legacy_ule_v3f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v3 -; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v4 -; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v5 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v3f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v2.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.l, v2.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v1.l, v3.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s1 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v3f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v3f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX11-NNAN-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmin_legacy_ule_v3f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v3f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_min_legacy_f32_e32 v0, v3, v0 +; SI-NEXT: v_min_legacy_f32_e32 v1, v4, v1 +; SI-NEXT: v_min_legacy_f32_e32 v2, v5, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_v3f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v2.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_v3f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <3 x half> %a, %b %val = select <3 x i1> %cmp, <3 x half> %a, <3 x half> %b ret <3 x half> %val } +define <3 x half> @test_fmin_legacy_ule_v3f16_fast(<3 x half> %a, <3 x half> %b) #0 { +; GFX9-LABEL: test_fmin_legacy_ule_v3f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v3f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v0, v0, v2 +; VI-NEXT: v_min_f16_e32 v1, v1, v3 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v3f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v3 +; SI-NEXT: v_min_f32_e32 v1, v1, v4 +; SI-NEXT: v_min_f32_e32 v2, v2, v5 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmin_legacy_ule_v3f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <3 x half> %a, %b + %val = select nnan nsz <3 x i1> %cmp, <3 x half> %a, <3 x half> %b + ret <3 x half> %val +} + define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmin_legacy_ule_v4f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v1, v6, v1, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v4f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX9-NNAN-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmin_legacy_ule_v4f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmin_legacy_ule_v4f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3 -; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmin_legacy_ule_v4f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v4, v0 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v1, v5, v1 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v2, v6, v2 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v3, v7, v3 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmin_legacy_ule_v4f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v4 -; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v5 -; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v6 -; SI-NNAN-NEXT: v_min_f32_e32 v3, v3, v7 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v4f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1.h, v3.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.h, v2.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v0.l, v2.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s2, v1.l, v3.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s1 -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s2 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v4f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v4f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX11-NNAN-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmin_legacy_ule_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_min_legacy_f32_e32 v0, v4, v0 +; SI-NEXT: v_min_legacy_f32_e32 v1, v5, v1 +; SI-NEXT: v_min_legacy_f32_e32 v2, v6, v2 +; SI-NEXT: v_min_legacy_f32_e32 v3, v7, v3 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_v4f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1.h, v3.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.h, v2.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s2, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_v4f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <4 x half> %a, %b %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b ret <4 x half> %val } +define <4 x half> @test_fmin_legacy_ule_v4f16_fast(<4 x half> %a, <4 x half> %b) #0 { +; GFX9-LABEL: test_fmin_legacy_ule_v4f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v4f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v1, v1, v3 +; VI-NEXT: v_min_f16_e32 v0, v0, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v4f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v4 +; SI-NEXT: v_min_f32_e32 v1, v1, v5 +; SI-NEXT: v_min_f32_e32 v2, v2, v6 +; SI-NEXT: v_min_f32_e32 v3, v3, v7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmin_legacy_ule_v4f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <4 x half> %a, %b + %val = select nnan nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmin_legacy_ule_v8f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v9, v8 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v7 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v2, v6 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v5 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v8, v0, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v1, v10, v1, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v2, v12, v2, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v3, v14, v3, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v8f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_min_f16 v0, v0, v4 -; GFX9-NNAN-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX9-NNAN-NEXT: v_pk_min_f16 v2, v2, v6 -; GFX9-NNAN-NEXT: v_pk_min_f16 v3, v3, v7 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmin_legacy_ule_v8f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v9, v8 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v7 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v2, v6 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v5 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; VI-SAFE-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; VI-SAFE-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmin_legacy_ule_v8f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_min_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_e32 v3, v3, v7 -; VI-NNAN-NEXT: v_min_f16_e32 v2, v2, v6 -; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v5 -; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v4 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11 -; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10 -; VI-NNAN-NEXT: v_or_b32_e32 v2, v2, v9 -; VI-NNAN-NEXT: v_or_b32_e32 v3, v3, v8 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmin_legacy_ule_v8f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v8, v0 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v1, v9, v1 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v2, v10, v2 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v3, v11, v3 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v4, v12, v4 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v5, v13, v5 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v6, v14, v6 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v7, v15, v7 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmin_legacy_ule_v8f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v8 -; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v9 -; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v10 -; SI-NNAN-NEXT: v_min_f32_e32 v3, v3, v11 -; SI-NNAN-NEXT: v_min_f32_e32 v4, v4, v12 -; SI-NNAN-NEXT: v_min_f32_e32 v5, v5, v13 -; SI-NNAN-NEXT: v_min_f32_e32 v6, v6, v14 -; SI-NNAN-NEXT: v_min_f32_e32 v7, v7, v15 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v8f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v4.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v1.h, v5.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v2.h, v6.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s2, v3.h, v7.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s3, v0.l, v4.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s4, v1.l, v5.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s5, v2.l, v6.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s6, v3.l, v7.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.h, v7.h, v3.h, s2 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.h, v2.h, s1 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s3 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s4 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s5 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v8f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v11, v10 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v13, v12 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v15, v14 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v9, v8 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v2, v6 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v4 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v5 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v7 -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v8f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_min_f16 v0, v0, v4 -; GFX11-NNAN-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX11-NNAN-NEXT: v_pk_min_f16 v2, v2, v6 -; GFX11-NNAN-NEXT: v_pk_min_f16 v3, v3, v7 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmin_legacy_ule_v8f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v9, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v8, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v10, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v12, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v14, v3, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v8f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; VI-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v9, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v7 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v2, v6 +; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_min_legacy_f32_e32 v0, v8, v0 +; SI-NEXT: v_min_legacy_f32_e32 v1, v9, v1 +; SI-NEXT: v_min_legacy_f32_e32 v2, v10, v2 +; SI-NEXT: v_min_legacy_f32_e32 v3, v11, v3 +; SI-NEXT: v_min_legacy_f32_e32 v4, v12, v4 +; SI-NEXT: v_min_legacy_f32_e32 v5, v13, v5 +; SI-NEXT: v_min_legacy_f32_e32 v6, v14, v6 +; SI-NEXT: v_min_legacy_f32_e32 v7, v15, v7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_v8f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v4.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v1.h, v5.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v2.h, v6.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s2, v3.h, v7.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s3, v0.l, v4.l +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s4, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s5, v2.l, v6.l +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s6, v3.l, v7.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v7.h, v3.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.h, v2.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_v8f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v11, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v13, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v15, v14 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v9, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v2, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v7 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <8 x half> %a, %b %val = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b ret <8 x half> %val } +define <8 x half> @test_fmin_legacy_ule_v8f16_fast(<8 x half> %a, <8 x half> %b) #0 { +; GFX9-LABEL: test_fmin_legacy_ule_v8f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v5 +; GFX9-NEXT: v_pk_min_f16 v2, v2, v6 +; GFX9-NEXT: v_pk_min_f16 v3, v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v8f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v3, v3, v7 +; VI-NEXT: v_min_f16_e32 v2, v2, v6 +; VI-NEXT: v_min_f16_e32 v1, v1, v5 +; VI-NEXT: v_min_f16_e32 v0, v0, v4 +; VI-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NEXT: v_or_b32_e32 v1, v1, v10 +; VI-NEXT: v_or_b32_e32 v2, v2, v9 +; VI-NEXT: v_or_b32_e32 v3, v3, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v8f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v8 +; SI-NEXT: v_min_f32_e32 v1, v1, v9 +; SI-NEXT: v_min_f32_e32 v2, v2, v10 +; SI-NEXT: v_min_f32_e32 v3, v3, v11 +; SI-NEXT: v_min_f32_e32 v4, v4, v12 +; SI-NEXT: v_min_f32_e32 v5, v5, v13 +; SI-NEXT: v_min_f32_e32 v6, v6, v14 +; SI-NEXT: v_min_f32_e32 v7, v7, v15 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmin_legacy_ule_v8f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v5 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v6 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <8 x half> %a, %b + %val = select nnan nsz <8 x i1> %cmp, <8 x half> %a, <8 x half> %b + ret <8 x half> %val +} + attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll index ec4dd85..defcffa 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll @@ -1,8 +1,6 @@ -; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN,FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI,GCN,FUNC %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN,FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,FUNC %s ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope --check-prefixes=EG,FUNC %s @@ -14,13 +12,9 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; FUNC-LABEL: {{^}}s_test_fmin_legacy_subreg_inputs_f32: ; EG: MIN * -; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; SI: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} - -; VI-SAFE: v_cmp_nlt_f32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} - -; VI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; VI: v_cmp_nlt_f32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(ptr addrspace(1) %out, <4 x float> %reg0) #0 { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = extractelement <4 x float> %reg0, i32 1 @@ -30,22 +24,32 @@ define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(ptr addrspace(1) ret void } -; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} +; FUNC-LABEL: {{^}}s_test_fmin_legacy_subreg_inputs_f32_fast: -; SI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[#LOAD + 2]] +; SI: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; GCN-NONAN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[#LOAD + 3]] +; VI: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32_fast(ptr addrspace(1) %out, <4 x float> %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = fcmp nnan nsz uge float %r0, %r1 + %r3 = select nnan nsz i1 %r2, float %r1, float %r0 + store float %r3, ptr addrspace(1) %out + ret void +} -; VI-SAFE: v_mov_b32_e32 [[VB:v[0-9]+]], s[[#LOAD + 3]] +; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32: +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} + +; SI: v_mov_b32_e32 [[VA:v[0-9]+]], s[[#LOAD + 2]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, s[[#LOAD + 3]], [[VA]] +; VI: v_mov_b32_e32 [[VB:v[0-9]+]], s[[#LOAD + 3]] -; VI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[#LOAD + 2]] -; VI-SAFE: v_cmp_ngt_f32_e32 vcc, s[[#LOAD + 2]], [[VB]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[VB]], [[VA]] +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, s[[#LOAD + 3]], [[VA]] -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[#LOAD + 2]], [[VB]] +; VI: v_mov_b32_e32 [[VA:v[0-9]+]], s[[#LOAD + 2]] +; VI: v_cmp_ngt_f32_e32 vcc, s[[#LOAD + 2]], [[VB]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[VB]], [[VA]] define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(ptr addrspace(1) %out, float %a, float %b) #0 { %cmp = fcmp ule float %a, %b %val = select i1 %cmp, float %a, float %b @@ -53,6 +57,19 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(ptr addrspace(1) %out, flo ret void } +; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32_fast: +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} + +; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[#LOAD + 3]] + +; GCN: v_min_f32_e32 {{v[0-9]+}}, s[[#LOAD + 2]], [[VB]] +define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_fast(ptr addrspace(1) %out, float %a, float %b) #0 { + %cmp = fcmp ule float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + ; Nsz also needed ; FIXME: Should separate tests ; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src: @@ -61,12 +78,10 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(ptr addrspace(1) %out, flo ; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#LOAD + 2]], 1.0 ; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#LOAD + 3]], 2.0 -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]] - -; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[ADD_A]], [[ADD_B]] -; VI-SAFE: v_cndmask_b32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]], vcc +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]] -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]] +; VI: v_cmp_ngt_f32_e32 vcc, [[ADD_A]], [[ADD_B]] +; VI: v_cndmask_b32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]], vcc define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(ptr addrspace(1) %out, float %a, float %b) #0 { %a.nnan = fadd nnan float %a, 1.0 %b.nnan = fadd nnan float %b, 2.0 @@ -76,16 +91,32 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(ptr addrspace(1) ret void } +; Nsz also needed +; FIXME: Should separate tests +; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src_fast: +; GCN: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} + +; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#LOAD + 2]], 1.0 +; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#LOAD + 3]], 2.0 + +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]] +define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src_fast(ptr addrspace(1) %out, float %a, float %b) #0 { + %a.nnan = fadd nnan float %a, 1.0 + %b.nnan = fadd nnan float %b, 2.0 + %cmp = fcmp ule float %a.nnan, %b.nnan + %val = select nnan nsz i1 %cmp, float %a.nnan, float %b.nnan + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + ; FUNC-LABEL: {{^}}test_fmin_legacy_ule_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] - -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; VI: v_cmp_ngt_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] define amdgpu_kernel void @test_fmin_legacy_ule_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -100,16 +131,33 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32: +; FUNC-LABEL: {{^}}test_fmin_legacy_ule_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @test_fmin_legacy_ule_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 -; VI-SAFE: v_cmp_le_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %cmp = fcmp ule float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + +; VI: v_cmp_le_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] define amdgpu_kernel void @test_fmin_legacy_ole_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -124,16 +172,33 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmin_legacy_olt_f32: +; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @test_fmin_legacy_ole_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 -; VI-SAFE: v_cmp_lt_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 + + %cmp = fcmp ole float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_fmin_legacy_olt_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; VI: v_cmp_lt_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] define amdgpu_kernel void @test_fmin_legacy_olt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -148,16 +213,33 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmin_legacy_ult_f32: +; FUNC-LABEL: {{^}}test_fmin_legacy_olt_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @test_fmin_legacy_olt_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 -; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + %cmp = fcmp olt float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; VI: v_cmp_nge_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] define amdgpu_kernel void @test_fmin_legacy_ult_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -172,16 +254,33 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32: +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @test_fmin_legacy_ult_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 -; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + %cmp = fcmp ult float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; VI: v_cmp_nge_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr <1 x float>, ptr addrspace(1) %in, i32 %tid @@ -196,19 +295,35 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(ptr addrspace(1) %out, ptr ret void } +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32_fast: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @test_fmin_legacy_ult_v1f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr <1 x float>, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr <1 x float>, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile <1 x float>, ptr addrspace(1) %gep.0 + %b = load volatile <1 x float>, ptr addrspace(1) %gep.1 + + %cmp = fcmp ult <1 x float> %a, %b + %val = select nnan nsz <1 x i1> %cmp, <1 x float> %a, <1 x float> %b + store <1 x float> %val, ptr addrspace(1) %out + ret void +} + ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v2f32: ; GCN: {{buffer|flat}}_load_dwordx2 ; GCN: {{buffer|flat}}_load_dwordx2 -; SI-SAFE: v_min_legacy_f32_e32 -; SI-SAFE: v_min_legacy_f32_e32 - -; VI-SAFE: v_cmp_nge_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE: v_cmp_nge_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 +; SI: v_min_legacy_f32_e32 +; SI: v_min_legacy_f32_e32 -; GCN-NONAN: v_min_f32_e32 -; GCN-NONAN: v_min_f32_e32 +; VI: v_cmp_nge_f32_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_nge_f32_e32 +; VI: v_cndmask_b32_e32 define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr <2 x float>, ptr addrspace(1) %in, i32 %tid @@ -223,25 +338,40 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(ptr addrspace(1) %out, ptr ret void } +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v2f32_fast: +; GCN: {{buffer|flat}}_load_dwordx2 +; GCN: {{buffer|flat}}_load_dwordx2 + +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +define amdgpu_kernel void @test_fmin_legacy_ult_v2f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr <2 x float>, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr <2 x float>, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile <2 x float>, ptr addrspace(1) %gep.0 + %b = load volatile <2 x float>, ptr addrspace(1) %gep.1 + + %cmp = fcmp ult <2 x float> %a, %b + %val = select nnan nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + store <2 x float> %val, ptr addrspace(1) %out + ret void +} + ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v3f32: -; SI-SAFE: v_min_legacy_f32_e32 -; SI-SAFE: v_min_legacy_f32_e32 -; SI-SAFE: v_min_legacy_f32_e32 -; SI-SAFE-NOT: v_min_ - -; VI-SAFE: v_cmp_nge_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE: v_cmp_nge_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE: v_cmp_nge_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 +; SI: v_min_legacy_f32_e32 +; SI: v_min_legacy_f32_e32 +; SI: v_min_legacy_f32_e32 +; SI-NOT: v_min_ + +; VI: v_cmp_nge_f32_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_nge_f32_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_nge_f32_e32 +; VI: v_cndmask_b32_e32 ; VI-NOT: v_cmp ; VI-NOT: v_cndmask - -; GCN-NONAN: v_min_f32_e32 -; GCN-NONAN: v_min_f32_e32 -; GCN-NONAN: v_min_f32_e32 -; GCN-NONAN-NOT: v_min_ define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr <3 x float>, ptr addrspace(1) %in, i32 %tid @@ -256,6 +386,28 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(ptr addrspace(1) %out, ptr ret void } +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v3f32_fast: +; VI-NOT: v_cmp +; VI-NOT: v_cndmask + +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN-NOT: v_min_ +define amdgpu_kernel void @test_fmin_legacy_ult_v3f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr <3 x float>, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr <3 x float>, ptr addrspace(1) %gep.0, i32 1 + + %a = load <3 x float>, ptr addrspace(1) %gep.0 + %b = load <3 x float>, ptr addrspace(1) %gep.1 + + %cmp = fcmp ult <3 x float> %a, %b + %val = select nnan nsz <3 x i1> %cmp, <3 x float> %a, <3 x float> %b + store <3 x float> %val, ptr addrspace(1) %out + ret void +} + ; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32_multi_use: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index 12e9888..aaea4f7 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -5015,7 +5015,7 @@ define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(ptr addrspace(1) %out %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext %a = load volatile double, ptr addrspace(1) %a.gep - %fneg.a = fsub double -0.000000e+00, %a + %fneg.a = fsub nsz double -0.000000e+00, %a %fpround = fptrunc double %fneg.a to float %fneg = fneg float %fpround store float %fneg, ptr addrspace(1) %out.gep diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index c4ca79d..3de6df2 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -4441,25 +4441,40 @@ define float @v_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1) { ret float %i3 } -define float @v_fmul_0_fsub_0_infloop_regression(float %arg) { -; GCN-SAFE-LABEL: v_fmul_0_fsub_0_infloop_regression: +define float @v_fmul_0_fsub_0_safe_infloop_regression(float %arg) { +; GCN-SAFE-LABEL: v_fmul_0_fsub_0_safe_infloop_regression: ; GCN-SAFE: ; %bb.0: ; %bb ; GCN-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-SAFE-NEXT: v_mul_f32_e32 v0, 0, v0 ; GCN-SAFE-NEXT: v_sub_f32_e32 v0, 0, v0 ; GCN-SAFE-NEXT: s_setpc_b64 s[30:31] ; -; GCN-NSZ-LABEL: v_fmul_0_fsub_0_infloop_regression: -; GCN-NSZ: ; %bb.0: ; %bb -; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 -; GCN-NSZ-NEXT: s_setpc_b64 s[30:31] +; SI-NSZ-LABEL: v_fmul_0_fsub_0_safe_infloop_regression: +; SI-NSZ: ; %bb.0: ; %bb +; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NSZ-NEXT: s_brev_b32 s4, 1 +; SI-NSZ-NEXT: v_fma_f32 v0, v0, s4, 0 +; SI-NSZ-NEXT: s_setpc_b64 s[30:31] +; FIXME: utils/update_llc_test_checks.py will generate redundant VI +; labels, remove them, they will cause test failure. bb: %i = fmul float %arg, 0.0 %i1 = fsub float 0.0, %i ret float %i1 } +define float @v_fmul_0_fsub_0_nsz_infloop_regression(float %arg) { +; GCN-LABEL: v_fmul_0_fsub_0_nsz_infloop_regression: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +bb: + %i = fmul float %arg, 0.0 + %i1 = fsub nsz float 0.0, %i + ret float %i1 +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 declare float @llvm.fma.f32(float, float, float) #1 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-dead-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-dead-intrinsics.ll new file mode 100644 index 0000000..d6198f5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-dead-intrinsics.ll @@ -0,0 +1,9 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s + +; CHECK: @arbitrary +declare amdgpu_kernel void @arbitrary(ptr addrspace(1)) + +; COM: This used to cause verifier errors when "lowered" +declare <4 x i8> @llvm.masked.load.v4i8.p7(ptr addrspace(7) captures(none), i32 immarg, <4 x i1>, <4 x i8>) +; CHECK-NOT: llvm.masked.load diff --git a/llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll b/llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll new file mode 100644 index 0000000..b508f73 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll @@ -0,0 +1,46 @@ +; RUN: opt -S -passes=amdgpu-late-codegenprepare \ +; RUN: -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s + +; Goal: With a loop-header PHI in illegal vector type and a same-BB +; non-lookthrough user (vector add) in the header, LRO should still coerce +; the PHI to i32 because a profitable sink (store) exists across BB. + +define amdgpu_kernel void @phi_samebb_nonlookthrough_store( + ptr addrspace(1) %out, <4 x i8> %v, i1 %exit) { +; CHECK-LABEL: @phi_samebb_nonlookthrough_store( +entry: + br label %loop + +loop: ; preds = %entry, %loop + ; Loop-carried PHI in illegal vector type. + %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ] + + ; Same-BB non-lookthrough use in header. + %acc.next = add <4 x i8> %acc, %v + + ; Make it a real loop: either iterate or exit to the sink block. + br i1 %exit, label %store, label %loop + +store: ; preds = %loop + ; The across-BB sink: storing the PHI coerced to i32. + %acc.bc = bitcast <4 x i8> %acc to i32 + store i32 %acc.bc, ptr addrspace(1) %out, align 4 + ret void +} + +; After AMDGPULateCodeGenPrepare we expect: +; - PHI is coerced to i32 +; - A header bitcast materializes for the add +; This proves the same-BB non-lookthrough user (add) did not get pruned +; when the def is a PHI. + +; CHECK: loop: +; CHECK: %[[ACC_TC:[^ ]+]] = phi i32 +; CHECK: %[[ACC_TC_BC:[^ ]+]] = bitcast i32 %[[ACC_TC]] to <4 x i8> +; CHECK: %[[ACC_NEXT:[^ ]+]] = add <4 x i8> %[[ACC_TC_BC]], %v +; CHECK: br i1 %exit, label %store, label %loop +; CHECK: store: +; CHECK: %[[ACC_TC_BC2:[^ ]+]] = bitcast i32 %[[ACC_TC]] to <4 x i8> +; CHECK: %[[ST_I32:[^ ]+]] = bitcast <4 x i8> %[[ACC_TC_BC2]] to i32 +; CHECK: store i32 %[[ST_I32]], + diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll index 92d3277..bb22144 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll @@ -4148,28 +4148,28 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; -------------------------------------------------------------------------------- define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) { -; CI-SAFE-LABEL: select_fneg_posk_src_add_v2f16: -; CI-SAFE: ; %bb.0: -; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-SAFE-NEXT: v_add_f32_e32 v3, 4.0, v3 -; CI-SAFE-NEXT: v_add_f32_e32 v2, 4.0, v2 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-SAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-SAFE-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-SAFE-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc -; CI-SAFE-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: select_fneg_posk_src_add_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_add_f32_e32 v3, 4.0, v3 +; CI-NEXT: v_add_f32_e32 v2, 4.0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc +; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_add_v2f16: ; VI-SAFE: ; %bb.0: @@ -4229,21 +4229,6 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; CI-NSZ-LABEL: select_fneg_posk_src_add_v2f16: -; CI-NSZ: ; %bb.0: -; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NSZ-NEXT: v_sub_f32_e32 v2, -4.0, v2 -; CI-NSZ-NEXT: v_sub_f32_e32 v3, -4.0, v3 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc -; CI-NSZ-NEXT: s_setpc_b64 s[30:31] -; ; VI-NSZ-LABEL: select_fneg_posk_src_add_v2f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4302,6 +4287,105 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ret <2 x half> %select } +define <2 x half> @select_fneg_posk_src_add_v2f16_nsz(<2 x i32> %c, <2 x half> %x, <2 x half> %y) { +; CI-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v2, -4.0, v2 +; CI-NEXT: v_sub_f32_e32 v3, -4.0, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0xc400 +; VI-NEXT: v_sub_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_sub_f16_e32 v2, -4.0, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_pk_add_f16 v1, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq <2 x i32> %c, zeroinitializer + %add = fadd nsz <2 x half> %x, <half 4.0, half 4.0> + %fneg = fneg <2 x half> %add + %select = select <2 x i1> %cmp, <2 x half> %fneg, <2 x half> <half 2.0, half 2.0> + ret <2 x half> %select +} + define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI-SAFE-LABEL: select_fneg_posk_src_sub_v2f16: ; CI-SAFE: ; %bb.0: @@ -4704,34 +4788,34 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < } define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %z) { -; CI-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16: -; CI-SAFE: ; %bb.0: -; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-SAFE-NEXT: v_mul_f32_e32 v3, 4.0, v3 -; CI-SAFE-NEXT: v_add_f32_e32 v3, v3, v5 -; CI-SAFE-NEXT: v_mul_f32_e32 v2, 4.0, v2 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-SAFE-NEXT: v_add_f32_e32 v2, v2, v4 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-SAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-SAFE-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-SAFE-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc -; CI-SAFE-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: select_fneg_posk_src_fmad_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_mul_f32_e32 v3, 4.0, v3 +; CI-NEXT: v_add_f32_e32 v3, v3, v5 +; CI-NEXT: v_mul_f32_e32 v2, 4.0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_add_f32_e32 v2, v2, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc +; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16: ; VI-SAFE: ; %bb.0: @@ -4793,27 +4877,6 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; CI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16: -; CI-NSZ: ; %bb.0: -; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NSZ-NEXT: v_mul_f32_e32 v2, -4.0, v2 -; CI-NSZ-NEXT: v_mul_f32_e32 v3, -4.0, v3 -; CI-NSZ-NEXT: v_sub_f32_e32 v2, v2, v4 -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NSZ-NEXT: v_sub_f32_e32 v3, v3, v5 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc -; CI-NSZ-NEXT: s_setpc_b64 s[30:31] -; ; VI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4873,6 +4936,112 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ret <2 x half> %select } +define <2 x half> @select_fneg_posk_src_fmad_v2f16_nsz(<2 x i32> %c, <2 x half> %x, <2 x half> %z) { +; CI-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_mul_f32_e32 v2, -4.0, v2 +; CI-NEXT: v_mul_f32_e32 v3, -4.0, v3 +; CI-NEXT: v_sub_f32_e32 v2, v2, v4 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_sub_f32_e32 v3, v3, v5 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_fma_f16 v1, v4, -4.0, -v1 +; VI-NEXT: v_fma_f16 v2, v2, -4.0, -v3 +; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_pk_fma_f16 v1, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_pk_fma_f16 v0, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_pk_fma_f16 v0, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq <2 x i32> %c, zeroinitializer + %fmad = call nsz <2 x half> @llvm.fmuladd.v2f16(<2 x half> %x, <2 x half> <half 4.0, half 4.0>, <2 x half> %z) + %fneg = fneg <2 x half> %fmad + %select = select <2 x i1> %cmp, <2 x half> %fneg, <2 x half> <half 2.0, half 2.0> + ret <2 x half> %select +} + declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0 declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll index c128715..f5dc824 100644 --- a/llvm/test/CodeGen/AMDGPU/v_mac.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll @@ -116,7 +116,7 @@ entry: ; GCN-LABEL: {{^}}nsz_mad_sub0_src0: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define amdgpu_kernel void @nsz_mad_sub0_src0(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +define amdgpu_kernel void @nsz_mad_sub0_src0(ptr addrspace(1) %out, ptr addrspace(1) %in) { entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -125,7 +125,7 @@ entry: %b = load float, ptr addrspace(1) %b_ptr %c = load float, ptr addrspace(1) %c_ptr - %neg_a = fsub float 0.0, %a + %neg_a = fsub nsz float 0.0, %a %tmp0 = fmul float %neg_a, %b %tmp1 = fadd float %tmp0, %c @@ -176,7 +176,7 @@ entry: ; GCN-LABEL: {{^}}nsz_mad_sub0_src1: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define amdgpu_kernel void @nsz_mad_sub0_src1(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +define amdgpu_kernel void @nsz_mad_sub0_src1(ptr addrspace(1) %out, ptr addrspace(1) %in) { entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -185,7 +185,7 @@ entry: %b = load float, ptr addrspace(1) %b_ptr %c = load float, ptr addrspace(1) %c_ptr - %neg_b = fsub float 0.0, %b + %neg_b = fsub nsz float 0.0, %b %tmp0 = fmul float %a, %neg_b %tmp1 = fadd float %tmp0, %c @@ -310,6 +310,5 @@ define float @v_mac_f32_dynamic_ftz(float %a, float %b, float %c) "denormal-fp-m declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" } -attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" } attributes #2 = { nounwind readnone } attributes #3 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll index bcc60b0..8da6f23 100644 --- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll @@ -236,7 +236,7 @@ entry: %b.val = load half, ptr addrspace(1) %b %c.val = load half, ptr addrspace(1) %c - %a.neg = fsub half 0.0, %a.val + %a.neg = fsub nsz half 0.0, %a.val %t.val = fmul half %a.neg, %b.val %r.val = fadd half %t.val, %c.val @@ -263,7 +263,7 @@ entry: %b.val = load half, ptr addrspace(1) %b %c.val = load half, ptr addrspace(1) %c - %b.neg = fsub half 0.0, %b.val + %b.neg = fsub nsz half 0.0, %b.val %t.val = fmul half %a.val, %b.neg %r.val = fadd half %t.val, %c.val @@ -290,7 +290,7 @@ entry: %b.val = load half, ptr addrspace(1) %b %c.val = load half, ptr addrspace(1) %c - %c.neg = fsub half 0.0, %c.val + %c.neg = fsub nsz half 0.0, %c.val %t.val = fmul half %a.val, %b.val %r.val = fadd half %t.val, %c.neg @@ -601,7 +601,7 @@ entry: %b.val = load <2 x half>, ptr addrspace(1) %b %c.val = load <2 x half>, ptr addrspace(1) %c - %a.neg = fsub <2 x half> <half 0.0, half 0.0>, %a.val + %a.neg = fsub nsz <2 x half> <half 0.0, half 0.0>, %a.val %t.val = fmul <2 x half> %a.neg, %b.val %r.val = fadd <2 x half> %t.val, %c.val @@ -634,7 +634,7 @@ entry: %b.val = load <2 x half>, ptr addrspace(1) %b %c.val = load <2 x half>, ptr addrspace(1) %c - %b.neg = fsub <2 x half> <half 0.0, half 0.0>, %b.val + %b.neg = fsub nsz <2 x half> <half 0.0, half 0.0>, %b.val %t.val = fmul <2 x half> %a.val, %b.neg %r.val = fadd <2 x half> %t.val, %c.val @@ -667,7 +667,7 @@ entry: %b.val = load <2 x half>, ptr addrspace(1) %b %c.val = load <2 x half>, ptr addrspace(1) %c - %c.neg = fsub <2 x half> <half 0.0, half 0.0>, %c.val + %c.neg = fsub nsz <2 x half> <half 0.0, half 0.0>, %c.val %t.val = fmul <2 x half> %a.val, %b.val %r.val = fadd <2 x half> %t.val, %c.neg @@ -678,5 +678,5 @@ entry: declare void @llvm.amdgcn.s.barrier() #2 attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" } -attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #1 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } attributes #2 = { nounwind convergent } |