; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s

; Each shuffle-based reduction below halves the number of live lanes per step:
; the upper half of the live lanes is shuffled down, combined with the lower
; half, and the final result is read from lane 0.

; fadd reduction of <4 x half>.
define half @reduction_fadd_v4f16(<4 x half> %vec4) {
; GFX9-LABEL: reduction_fadd_v4f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1
; GFX9-NEXT:    v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_fadd_v4f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_add_f16_e32 v0, v0, v1
; VI-NEXT:    v_add_f16_e32 v0, v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
  %bin.rdx = fadd <4 x half> %vec4, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
  %bin.rdx2 = fadd <4 x half> %bin.rdx, %rdx.shuf1
  %res = extractelement <4 x half> %bin.rdx2, i32 0
  ret half %res
}

; fsub reduction of <4 x half>.
define half @reduction_fsub_v4f16(<4 x half> %vec4) {
; GFX9-LABEL: reduction_fsub_v4f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_fsub_v4f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_sub_f16_e32 v0, v0, v1
; VI-NEXT:    v_sub_f16_e32 v0, v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
  %bin.rdx = fsub <4 x half> %vec4, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
  %bin.rdx2 = fsub <4 x half> %bin.rdx, %rdx.shuf1
  %res = extractelement <4 x half> %bin.rdx2, i32 0
  ret half %res
}

; Make sure nsz is preserved when the operations are split: the trailing
; negation (fsub -0.0, %res) is expected to fold into the subtractions.
define half @reduction_fsub_v4f16_preserve_fmf(<4 x half> %vec4) {
; GFX9-LABEL: reduction_fsub_v4f16_preserve_fmf:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_fsub_v4f16_preserve_fmf:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_sub_f16_e32 v0, v1, v0
; VI-NEXT:    v_add_f16_e32 v0, v2, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
  %bin.rdx = fsub nsz <4 x half> %vec4, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
  %bin.rdx2 = fsub nsz <4 x half> %bin.rdx, %rdx.shuf1
  %res = extractelement <4 x half> %bin.rdx2, i32 0
  %neg.res = fsub half -0.0, %res
  ret half %neg.res
}

; fmul reduction of <4 x half>.
define half @reduction_fmul_half4(<4 x half> %vec4) {
; GFX9-LABEL: reduction_fmul_half4:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_fmul_half4:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mul_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_mul_f16_e32 v0, v0, v1
; VI-NEXT:    v_mul_f16_e32 v0, v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
  %bin.rdx = fmul <4 x half> %vec4, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
  %bin.rdx2 = fmul <4 x half> %bin.rdx, %rdx.shuf1
  %res = extractelement <4 x half> %bin.rdx2, i32 0
  ret half %res
}

; Integer add reduction of <4 x i16>.
define i16 @reduction_v4i16(<4 x i16> %vec4) {
; GFX9-LABEL: reduction_v4i16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
; GFX9-NEXT:    v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_v4i16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_add_u16_e32 v0, v0, v1
; VI-NEXT:    v_add_u16_e32 v0, v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
  %bin.rdx = add <4 x i16> %vec4, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x i16> %bin.rdx, <4 x i16> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
  %bin.rdx2 = add <4 x i16> %bin.rdx, %rdx.shuf1
  %res = extractelement <4 x i16> %bin.rdx2, i32 0
  ret i16 %res
}

; fadd reduction of <8 x half> (three halving steps).
define half @reduction_half8(<8 x half> %vec8) {
; GFX9-LABEL: reduction_half8:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_f16 v1, v1, v3
; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1
; GFX9-NEXT:    v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_half8:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_add_f16_e32 v1, v1, v3
; VI-NEXT:    v_add_f16_e32 v0, v0, v2
; VI-NEXT:    v_add_f16_e32 v2, v5, v4
; VI-NEXT:    v_add_f16_e32 v0, v0, v1
; VI-NEXT:    v_add_f16_e32 v0, v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <8 x half> %vec8, <8 x half> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
  %bin.rdx = fadd <8 x half> %vec8, %rdx.shuf
  %rdx.shuf1 = shufflevector <8 x half> %bin.rdx, <8 x half> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %bin.rdx2 = fadd <8 x half> %bin.rdx, %rdx.shuf1
  %rdx.shuf3 = shufflevector <8 x half> %bin.rdx2, <8 x half> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %bin.rdx4 = fadd <8 x half> %bin.rdx2, %rdx.shuf3
  %res = extractelement <8 x half> %bin.rdx4, i32 0
  ret half %res
}

; Integer add reduction of <8 x i16> (three halving steps).
define i16 @reduction_v8i16(<8 x i16> %vec8) {
; GFX9-LABEL: reduction_v8i16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v1, v1, v3
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v2
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
; GFX9-NEXT:    v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_v8i16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_u16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_add_u16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_add_u16_e32 v1, v1, v3
; VI-NEXT:    v_add_u16_e32 v0, v0, v2
; VI-NEXT:    v_add_u16_e32 v2, v5, v4
; VI-NEXT:    v_add_u16_e32 v0, v0, v1
; VI-NEXT:    v_add_u16_e32 v0, v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <8 x i16> %vec8, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
  %bin.rdx = add <8 x i16> %vec8, %rdx.shuf
  %rdx.shuf1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %bin.rdx2 = add <8 x i16> %bin.rdx, %rdx.shuf1
  %rdx.shuf3 = shufflevector <8 x i16> %bin.rdx2, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %bin.rdx4 = add <8 x i16> %bin.rdx2, %rdx.shuf3
  %res = extractelement <8 x i16> %bin.rdx4, i32 0
  ret i16 %res
}

; fadd reduction of <16 x half> (four halving steps).
define half @reduction_half16(<16 x half> %vec16) {
; GFX9-LABEL: reduction_half16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_f16 v2, v2, v6
; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
; GFX9-NEXT:    v_pk_add_f16 v3, v3, v7
; GFX9-NEXT:    v_pk_add_f16 v1, v1, v5
; GFX9-NEXT:    v_pk_add_f16 v1, v1, v3
; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1
; GFX9-NEXT:    v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_half16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_f16_sdwa v8, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_add_f16_sdwa v9, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_add_f16_sdwa v10, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_add_f16_sdwa v11, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_add_f16_e32 v2, v2, v6
; VI-NEXT:    v_add_f16_e32 v0, v0, v4
; VI-NEXT:    v_add_f16_e32 v3, v3, v7
; VI-NEXT:    v_add_f16_e32 v1, v1, v5
; VI-NEXT:    v_add_f16_e32 v4, v11, v10
; VI-NEXT:    v_add_f16_e32 v5, v9, v8
; VI-NEXT:    v_add_f16_e32 v1, v1, v3
; VI-NEXT:    v_add_f16_e32 v0, v0, v2
; VI-NEXT:    v_add_f16_e32 v2, v5, v4
; VI-NEXT:    v_add_f16_e32 v0, v0, v1
; VI-NEXT:    v_add_f16_e32 v0, v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <16 x half> %vec16, <16 x half> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %bin.rdx = fadd <16 x half> %vec16, %rdx.shuf
  %rdx.shuf1 = shufflevector <16 x half> %bin.rdx, <16 x half> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %bin.rdx2 = fadd <16 x half> %bin.rdx, %rdx.shuf1
  %rdx.shuf3 = shufflevector <16 x half> %bin.rdx2, <16 x half> poison, <16 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %bin.rdx4 = fadd <16 x half> %bin.rdx2, %rdx.shuf3
  %rdx.shuf5 = shufflevector <16 x half> %bin.rdx4, <16 x half> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %bin.rdx6 = fadd <16 x half> %bin.rdx4, %rdx.shuf5
  %res = extractelement <16 x half> %bin.rdx6, i32 0
  ret half %res
}

; Unsigned min reduction of <4 x i16>, expressed as icmp ult + select.
define i16 @reduction_min_v4i16(<4 x i16> %vec4) {
; GFX9-LABEL: reduction_min_v4i16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_min_u16 v0, v0, v1
; GFX9-NEXT:    v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_min_v4i16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_min_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_min_u16_e32 v0, v0, v1
; VI-NEXT:    v_min_u16_e32 v0, v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
  %rdx.minmax.cmp = icmp ult <4 x i16> %vec4, %rdx.shuf
  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
  %rdx.minmax.cmp2 = icmp ult <4 x i16> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
  %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
  ret i16 %res
}

; Unsigned min reduction of <8 x i16>, expressed as icmp ult + select.
define i16 @reduction_umin_v8i16(<8 x i16> %vec8) {
; GFX9-LABEL: reduction_umin_v8i16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_min_u16 v1, v1, v3
; GFX9-NEXT:    v_pk_min_u16 v0, v0, v2
; GFX9-NEXT:    v_pk_min_u16 v0, v0, v1
; GFX9-NEXT:    v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_umin_v8i16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_min_u16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_min_u16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_min_u16_e32 v1, v1, v3
; VI-NEXT:    v_min_u16_e32 v0, v0, v2
; VI-NEXT:    v_min_u16_e32 v2, v5, v4
; VI-NEXT:    v_min_u16_e32 v0, v0, v1
; VI-NEXT:    v_min_u16_e32 v0, v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <8 x i16> %vec8, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
  %rdx.minmax.cmp = icmp ult <8 x i16> %vec8, %rdx.shuf
  %rdx.minmax.select = select <8 x i1> %rdx.minmax.cmp, <8 x i16> %vec8, <8 x i16> %rdx.shuf
  %rdx.shuf1 = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %rdx.minmax.cmp2 = icmp ult <8 x i16> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <8 x i1> %rdx.minmax.cmp2, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf1
  %rdx.shuf4 = shufflevector <8 x i16> %rdx.minmax.select3, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %rdx.minmax.cmp5 = icmp ult <8 x i16> %rdx.minmax.select3, %rdx.shuf4
  %rdx.minmax.select6 = select <8 x i1> %rdx.minmax.cmp5, <8 x i16> %rdx.minmax.select3, <8 x i16> %rdx.shuf4
  %res = extractelement <8 x i16> %rdx.minmax.select6, i32 0
  ret i16 %res
}

; Make sure that without SLP vectorization the reduction needs more instructions.
; Scalar (non-SLP-vectorized) unsigned min over all eight lanes of <8 x i16>:
; each lane is extracted and folded into a running minimum in order.
define i16 @reduction_umin_v8i16_woslp(<8 x i16> %vec8) {
; GFX9-LABEL: reduction_umin_v8i16_woslp:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NEXT:    v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; GFX9-NEXT:    v_min3_u16 v0, v4, v1, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
; GFX9-NEXT:    v_min3_u16 v0, v5, v2, v0
; GFX9-NEXT:    v_min3_u16 v0, v6, v3, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_umin_v8i16_woslp:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_min_u16_e32 v0, v1, v0
; VI-NEXT:    v_min_u16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_min_u16_e32 v0, v2, v0
; VI-NEXT:    v_min_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_min_u16_e32 v0, v3, v0
; VI-NEXT:    v_min_u16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %elt0 = extractelement <8 x i16> %vec8, i64 0
  %elt1 = extractelement <8 x i16> %vec8, i64 1
  %elt2 = extractelement <8 x i16> %vec8, i64 2
  %elt3 = extractelement <8 x i16> %vec8, i64 3
  %elt4 = extractelement <8 x i16> %vec8, i64 4
  %elt5 = extractelement <8 x i16> %vec8, i64 5
  %elt6 = extractelement <8 x i16> %vec8, i64 6
  %elt7 = extractelement <8 x i16> %vec8, i64 7

  %cmp0 = icmp ult i16 %elt1, %elt0
  %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
  %cmp1 = icmp ult i16 %elt2, %min1
  %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
  %cmp2 = icmp ult i16 %elt3, %min2
  %min3 = select i1 %cmp2, i16 %elt3, i16 %min2

  %cmp3 = icmp ult i16 %elt4, %min3
  %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
  %cmp4 = icmp ult i16 %elt5, %min4
  %min5 = select i1 %cmp4, i16 %elt5, i16 %min4

  %cmp5 = icmp ult i16 %elt6, %min5
  %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
  %cmp6 = icmp ult i16 %elt7, %min6
  %min7 = select i1 %cmp6, i16 %elt7, i16 %min6

  ret i16 %min7
}

; Signed min reduction of <16 x i16> via four halving shuffle steps,
; expressed as icmp slt + select; the result is read from lane 0.
define i16 @reduction_smin_v16i16(<16 x i16> %vec16) {
; GFX9-LABEL: reduction_smin_v16i16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_min_i16 v2, v2, v6
; GFX9-NEXT:    v_pk_min_i16 v0, v0, v4
; GFX9-NEXT:    v_pk_min_i16 v3, v3, v7
; GFX9-NEXT:    v_pk_min_i16 v1, v1, v5
; GFX9-NEXT:    v_pk_min_i16 v1, v1, v3
; GFX9-NEXT:    v_pk_min_i16 v0, v0, v2
; GFX9-NEXT:    v_pk_min_i16 v0, v0, v1
; GFX9-NEXT:    v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_smin_v16i16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_min_i16_sdwa v8, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_min_i16_sdwa v9, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_min_i16_sdwa v10, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_min_i16_sdwa v11, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_min_i16_e32 v2, v2, v6
; VI-NEXT:    v_min_i16_e32 v0, v0, v4
; VI-NEXT:    v_min_i16_e32 v3, v3, v7
; VI-NEXT:    v_min_i16_e32 v1, v1, v5
; VI-NEXT:    v_min_i16_e32 v4, v11, v10
; VI-NEXT:    v_min_i16_e32 v5, v9, v8
; VI-NEXT:    v_min_i16_e32 v1, v1, v3
; VI-NEXT:    v_min_i16_e32 v0, v0, v2
; VI-NEXT:    v_min_i16_e32 v2, v5, v4
; VI-NEXT:    v_min_i16_e32 v0, v0, v1
; VI-NEXT:    v_min_i16_e32 v0, v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <16 x i16> %vec16, <16 x i16> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %rdx.minmax.cmp = icmp slt <16 x i16> %vec16, %rdx.shuf
  %rdx.minmax.select = select <16 x i1> %rdx.minmax.cmp, <16 x i16> %vec16, <16 x i16> %rdx.shuf
  %rdx.shuf1 = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %rdx.minmax.cmp2 = icmp slt <16 x i16> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <16 x i1> %rdx.minmax.cmp2, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf1
  %rdx.shuf4 = shufflevector <16 x i16> %rdx.minmax.select3, <16 x i16> poison, <16 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %rdx.minmax.cmp5 = icmp slt <16 x i16> %rdx.minmax.select3, %rdx.shuf4
  %rdx.minmax.select6 = select <16 x i1> %rdx.minmax.cmp5, <16 x i16> %rdx.minmax.select3, <16 x i16> %rdx.shuf4
  %rdx.shuf7 = shufflevector <16 x i16> %rdx.minmax.select6, <16 x i16> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %rdx.minmax.cmp8 = icmp slt <16 x i16> %rdx.minmax.select6, %rdx.shuf7
  %rdx.minmax.select9 = select <16 x i1> %rdx.minmax.cmp8, <16 x i16> %rdx.minmax.select6, <16 x i16> %rdx.shuf7
  %res = extractelement <16 x i16> %rdx.minmax.select9, i32 0
  ret i16 %res
}

; Make sure that without SLP vectorization the reduction needs more instructions.
define i16 @reduction_smin_v16i16_woslp(<16 x i16> %vec16) {
; GFX9-LABEL: reduction_smin_v16i16_woslp:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
; GFX9-NEXT:    v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
; GFX9-NEXT:    v_min3_i16 v0, v8, v1, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
; GFX9-NEXT:    v_min3_i16 v0, v9, v2, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v4
; GFX9-NEXT:    v_min3_i16 v0, v10, v3, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v5
; GFX9-NEXT:    v_min3_i16 v0, v11, v4, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
; GFX9-NEXT:    v_min3_i16 v0, v12, v5, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
; GFX9-NEXT:    v_min3_i16 v0, v13, v6, v0
; GFX9-NEXT:    v_min3_i16 v0, v14, v7, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_smin_v16i16_woslp:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_min_i16_e32 v0, v1, v0
; VI-NEXT:    v_min_i16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_min_i16_e32 v0, v2, v0
; VI-NEXT:    v_min_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_min_i16_e32 v0, v3, v0
; VI-NEXT:    v_min_i16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_min_i16_e32 v0, v4, v0
; VI-NEXT:    v_min_i16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_min_i16_e32 v0, v5, v0
; VI-NEXT:    v_min_i16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_min_i16_e32 v0, v6, v0
; VI-NEXT:    v_min_i16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_min_i16_e32 v0, v7, v0
; VI-NEXT:    v_min_i16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %elt0 = extractelement <16 x i16> %vec16, i64 0
  %elt1 = extractelement <16 x i16> %vec16, i64 1
  %elt2 = extractelement <16 x i16> %vec16, i64 2
  %elt3 = extractelement <16 x i16> %vec16, i64 3
  %elt4 = extractelement <16 x i16> %vec16, i64 4
  %elt5 = extractelement <16 x i16> %vec16, i64 5
  %elt6 = extractelement <16 x i16> %vec16, i64 6
  %elt7 = extractelement <16 x i16> %vec16, i64 7

  %elt8 = extractelement <16 x i16> %vec16, i64 8
  %elt9 = extractelement <16 x i16> %vec16, i64 9
  %elt10 = extractelement <16 x i16> %vec16, i64 10
  %elt11 = extractelement <16 x i16> %vec16, i64 11
  %elt12 = extractelement <16 x i16> %vec16, i64 12
  %elt13 = extractelement <16 x i16> %vec16, i64 13
  %elt14 = extractelement <16 x i16> %vec16, i64 14
  %elt15 = extractelement <16 x i16> %vec16, i64 15

  %cmp0 = icmp slt i16 %elt1, %elt0
  %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
  %cmp1 = icmp slt i16 %elt2, %min1
  %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
  %cmp2 = icmp slt i16 %elt3, %min2
  %min3 = select i1 %cmp2, i16 %elt3, i16 %min2

  %cmp3 = icmp slt i16 %elt4, %min3
  %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
  %cmp4 = icmp slt i16 %elt5, %min4
  %min5 = select i1 %cmp4, i16 %elt5, i16 %min4

  %cmp5 = icmp slt i16 %elt6, %min5
  %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
  %cmp6 = icmp slt i16 %elt7, %min6
  %min7 = select i1 %cmp6, i16 %elt7, i16 %min6

  %cmp7 = icmp slt i16 %elt8, %min7
  %min8 = select i1 %cmp7, i16 %elt8, i16 %min7
  %cmp8 = icmp slt i16 %elt9, %min8
  %min9 = select i1 %cmp8, i16 %elt9, i16 %min8

  %cmp9 = icmp slt i16 %elt10, %min9
  %min10 = select i1 %cmp9, i16 %elt10, i16 %min9
  %cmp10 = icmp slt i16 %elt11, %min10
  %min11 = select i1 %cmp10, i16 %elt11, i16 %min10

  %cmp11 = icmp slt i16 %elt12, %min11
  %min12 = select i1 %cmp11, i16 %elt12, i16 %min11
  %cmp12 = icmp slt i16 %elt13, %min12
  %min13 = select i1 %cmp12, i16 %elt13, i16 %min12

  %cmp13 = icmp slt i16 %elt14, %min13
  %min14 = select i1 %cmp13, i16 %elt14, i16 %min13
  %cmp14 = icmp slt i16 %elt15, %min14
  %min15 = select i1 %cmp14, i16 %elt15, i16 %min14

  ret i16 %min15
}

; Unsigned max reduction of <4 x i16>, expressed as icmp ugt + select.
define i16 @reduction_umax_v4i16(<4 x i16> %vec4) {
; GFX9-LABEL: reduction_umax_v4i16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_u16 v0, v0, v1
; GFX9-NEXT:    v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_umax_v4i16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_max_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_max_u16_e32 v0, v0, v1
; VI-NEXT:    v_max_u16_e32 v0, v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
  %rdx.minmax.cmp = icmp ugt <4 x i16> %vec4, %rdx.shuf
  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
  %rdx.minmax.cmp2 = icmp ugt <4 x i16> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
  %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
  ret i16 %res
}

; Signed max reduction of <4 x i16>, expressed as icmp sgt + select.
define i16 @reduction_smax_v4i16(<4 x i16> %vec4) #0 {
; GFX9-LABEL: reduction_smax_v4i16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_i16 v0, v0, v1
; GFX9-NEXT:    v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_smax_v4i16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_max_i16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_max_i16_e32 v0, v0, v1
; VI-NEXT:    v_max_i16_e32 v0, v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
  %rdx.minmax.cmp = icmp sgt <4 x i16> %vec4, %rdx.shuf
  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
  %rdx.minmax.cmp2 = icmp sgt <4 x i16> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
  %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
  ret i16 %res
}

; maxnum reduction of <4 x half> through the llvm.maxnum.v4f16 intrinsic.
define half @reduction_maxnum_v4f16(<4 x half> %vec4) {
; GFX9-LABEL: reduction_maxnum_v4f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
; GFX9-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_maxnum_v4f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_max_f16_e32 v1, v1, v1
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_max_f16_e32 v2, v3, v2
; VI-NEXT:    v_max_f16_e32 v0, v0, v1
; VI-NEXT:    v_max_f16_e32 v0, v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
  %rdx.minmax = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %vec4, <4 x half> %rdx.shuf)
  %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax, <4 x half> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
  %rdx.minmax3 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %rdx.minmax, <4 x half> %rdx.shuf1)
  %res = extractelement <4 x half> %rdx.minmax3, i32 0
  ret half %res
}

; minnum reduction of <4 x half> through the llvm.minnum.v4f16 intrinsic.
define half @reduction_minnum_v4f16(<4 x half> %vec4) {
; GFX9-LABEL: reduction_minnum_v4f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
; GFX9-NEXT:    v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_minnum_v4f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_max_f16_e32 v1, v1, v1
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_min_f16_e32 v2, v3, v2
; VI-NEXT:    v_min_f16_e32 v0, v0, v1
; VI-NEXT:    v_min_f16_e32 v0, v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
  %rdx.minmax = call <4 x half> @llvm.minnum.v4f16(<4 x half> %vec4, <4 x half> %rdx.shuf)
  %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax, <4 x half> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
  %rdx.minmax3 = call <4 x half> @llvm.minnum.v4f16(<4 x half> %rdx.minmax, <4 x half> %rdx.shuf1)
  %res = extractelement <4 x half> %rdx.minmax3, i32 0
  ret half %res
}

; FIXME: Need to preserve fast math flags when fmaxnum is matched
; directly from the IR to avoid unnecessary quieting.
define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) {
; GFX9-LABEL: reduction_fast_max_pattern_v4f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
; GFX9-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_fast_max_pattern_v4f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_max_f16_e32 v1, v1, v1
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_max_f16_e32 v2, v3, v2
; VI-NEXT:    v_max_f16_e32 v0, v0, v1
; VI-NEXT:    v_max_f16_e32 v0, v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
  %rdx.minmax.cmp = fcmp nnan nsz ogt <4 x half> %vec4, %rdx.shuf
  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
  %rdx.minmax.cmp2 = fcmp nnan nsz ogt <4 x half> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
  %res = extractelement <4 x half> %rdx.minmax.select3, i32 0
  ret half %res
}

; FIXME: Need to preserve fast math flags when fminnum is matched
; directly from the IR to avoid unnecessary quieting.
define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) {
; GFX9-LABEL: reduction_fast_min_pattern_v4f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
; GFX9-NEXT:    v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: reduction_fast_min_pattern_v4f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_max_f16_e32 v1, v1, v1
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_min_f16_e32 v2, v3, v2
; VI-NEXT:    v_min_f16_e32 v0, v0, v1
; VI-NEXT:    v_min_f16_e32 v0, v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
  %rdx.minmax.cmp = fcmp nnan nsz olt <4 x half> %vec4, %rdx.shuf
  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
  %rdx.minmax.cmp2 = fcmp nnan nsz olt <4 x half> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
  %res = extractelement <4 x half> %rdx.minmax.select3, i32 0
  ret half %res
}

declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>)
declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>)