author     Jim Lin <jim@andestech.com>        2025-03-04 16:49:24 +0800
committer  GitHub <noreply@github.com>        2025-03-04 16:49:24 +0800
commit     03505a004ff6909c46d6b8c498a9ffccd47d88a0 (patch)
tree       746a19d5c27e87e443b97eb387f36ee0064231bb
parent     47fb9c4bb9b057ab45c5228937a2c1fbf51c4f72 (diff)
[RISCV] Enable scalable loop vectorization for fmax/fmin reductions with f16/bf16 type for zvfhmin/zvfbfmin (#129629)
This PR enables scalable loop vectorization for fmax and fmin reductions
with f16/bf16 types when only zvfhmin/zvfbfmin are enabled.

After https://github.com/llvm/llvm-project/pull/128800, fmax/fmin reductions
with f16/bf16 types can be promoted to f32 reductions for zvfhmin/zvfbfmin.
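
As an illustration (not part of the commit), the kind of source loop this change lets the loop vectorizer handle is a plain fmin/fmax reduction over f16 data when only Zvfhmin is available; the reduction itself is promoted to f32 under the hood. A minimal sketch in C, where the function name and compile flags are illustrative assumptions, not taken from the commit:

```c
/* Illustrative sketch: an fmin-style reduction over _Float16 that can now be
 * vectorized with scalable vectors when only Zvfhmin (no Zvfh) is enabled.
 * The fast-math-style flags mirror the nnan/nsz attributes the LIT test uses.
 * Example (assumed) invocation:
 *   clang --target=riscv64 -march=rv64gcv_zfhmin_zvfhmin -O3 -ffast-math -c fmin_reduce.c
 */
_Float16 fmin_reduce(const _Float16 *a, long n) {
  _Float16 m = a[0];
  for (long i = 1; i < n; ++i)
    m = a[i] < m ? a[i] : m; /* fcmp olt + select: the FMin recurrence pattern */
  return m;
}
```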
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h                |  17
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll | 130
2 files changed, 138 insertions, 9 deletions
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 3f57560..020a2b8 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -349,15 +349,8 @@ public:
     if (!TLI->isLegalElementTypeForRVV(TLI->getValueType(DL, Ty)))
       return false;
 
-    // We can't promote f16/bf16 fadd reductions and scalable vectors can't be
-    // expanded.
-    // TODO: Promote f16/bf16 fmin/fmax reductions
-    if (Ty->isBFloatTy() || (Ty->isHalfTy() && !ST->hasVInstructionsF16()))
-      return false;
-
     switch (RdxDesc.getRecurrenceKind()) {
     case RecurKind::Add:
-    case RecurKind::FAdd:
     case RecurKind::And:
     case RecurKind::Or:
     case RecurKind::Xor:
@@ -365,11 +358,17 @@ public:
     case RecurKind::SMax:
     case RecurKind::UMin:
     case RecurKind::UMax:
+    case RecurKind::IAnyOf:
     case RecurKind::FMin:
     case RecurKind::FMax:
-    case RecurKind::FMulAdd:
-    case RecurKind::IAnyOf:
+      return true;
     case RecurKind::FAnyOf:
+    case RecurKind::FAdd:
+    case RecurKind::FMulAdd:
+      // We can't promote f16/bf16 fadd reductions and scalable vectors can't be
+      // expanded.
+      if (Ty->isBFloatTy() || (Ty->isHalfTy() && !ST->hasVInstructionsF16()))
+        return false;
       return true;
     default:
       return false;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
index 01a2a75..c3c2b8e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
@@ -344,6 +344,70 @@ for.end:
   ret float %.sroa.speculated
 }
 
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
+define half @fmin_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) #1 {
+; CHECK-LABEL: @fmin_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x half>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x half>
+; CHECK: %[[FCMP1:.*]] = fcmp olt <vscale x 8 x half> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp olt <vscale x 8 x half> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x half> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x half> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp olt <vscale x 8 x half> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x half> %[[SEL1]], <vscale x 8 x half> %[[SEL2]]
+; CHECK-NEXT: call half @llvm.vector.reduce.fmin.nxv8f16(<vscale x 8 x half> %[[SEL]])
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi half [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+  %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
+  %0 = load half, ptr %arrayidx, align 4
+  %cmp.i = fcmp olt half %0, %sum.07
+  %.sroa.speculated = select i1 %cmp.i, half %0, half %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret half %.sroa.speculated
+}
+
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
+define bfloat @fmin_fast_bfloat_zvfbfmin(ptr noalias nocapture readonly %a, i64 %n) #2 {
+; CHECK-LABEL: @fmin_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x bfloat>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x bfloat>
+; CHECK: %[[FCMP1:.*]] = fcmp olt <vscale x 8 x bfloat> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp olt <vscale x 8 x bfloat> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x bfloat> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x bfloat> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp olt <vscale x 8 x bfloat> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x bfloat> %[[SEL1]], <vscale x 8 x bfloat> %[[SEL2]]
+; CHECK-NEXT: call bfloat @llvm.vector.reduce.fmin.nxv8bf16(<vscale x 8 x bfloat> %[[SEL]])
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi bfloat [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+  %arrayidx = getelementptr inbounds bfloat, ptr %a, i64 %iv
+  %0 = load bfloat, ptr %arrayidx, align 4
+  %cmp.i = fcmp olt bfloat %0, %sum.07
+  %.sroa.speculated = select i1 %cmp.i, bfloat %0, bfloat %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret bfloat %.sroa.speculated
+}
+
 ; FMAX (FAST)
 ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
@@ -378,6 +442,70 @@ for.end:
   ret float %.sroa.speculated
 }
 
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
+define half @fmax_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) #1 {
+; CHECK-LABEL: @fmax_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x half>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x half>
+; CHECK: %[[FCMP1:.*]] = fcmp fast ogt <vscale x 8 x half> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp fast ogt <vscale x 8 x half> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x half> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x half> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp fast ogt <vscale x 8 x half> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select fast <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x half> %[[SEL1]], <vscale x 8 x half> %[[SEL2]]
+; CHECK-NEXT: call fast half @llvm.vector.reduce.fmax.nxv8f16(<vscale x 8 x half> %[[SEL]])
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi half [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+  %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
+  %0 = load half, ptr %arrayidx, align 4
+  %cmp.i = fcmp fast ogt half %0, %sum.07
+  %.sroa.speculated = select i1 %cmp.i, half %0, half %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret half %.sroa.speculated
+}
+
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
+define bfloat @fmax_fast_bfloat_zvfbfmin(ptr noalias nocapture readonly %a, i64 %n) #2 {
+; CHECK-LABEL: @fmax_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x bfloat>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x bfloat>
+; CHECK: %[[FCMP1:.*]] = fcmp fast ogt <vscale x 8 x bfloat> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp fast ogt <vscale x 8 x bfloat> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x bfloat> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x bfloat> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp fast ogt <vscale x 8 x bfloat> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select fast <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x bfloat> %[[SEL1]], <vscale x 8 x bfloat> %[[SEL2]]
+; CHECK-NEXT: call fast bfloat @llvm.vector.reduce.fmax.nxv8bf16(<vscale x 8 x bfloat> %[[SEL]])
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi bfloat [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+  %arrayidx = getelementptr inbounds bfloat, ptr %a, i64 %iv
+  %0 = load bfloat, ptr %arrayidx, align 4
+  %cmp.i = fcmp fast ogt bfloat %0, %sum.07
+  %.sroa.speculated = select i1 %cmp.i, bfloat %0, bfloat %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret bfloat %.sroa.speculated
+}
+
 ; Reduction cannot be vectorized
 ; MUL
@@ -591,6 +719,8 @@ for.end:
 declare float @llvm.fmuladd.f32(float, float, float)
 
 attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
+attributes #1 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "target-features"="+zfhmin,+zvfhmin"}
+attributes #2 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "target-features"="+zfbfmin,+zvfbfmin"}
 
 !0 = distinct !{!0, !1, !2, !3, !4}
 !1 = !{!"llvm.loop.vectorize.width", i32 8}