diff options
3 files changed, 107 insertions, 130 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 11a4aa4..da0798e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4159,6 +4159,26 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, switch (ISD) { default: break; + case ISD::FADD: + if (Type *EltTy = ValTy->getScalarType(); + // FIXME: For half types without fullfp16 support, this could extend and + // use a fp32 faddp reduction but current codegen unrolls. + MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() || + (EltTy->isHalfTy() && ST->hasFullFP16()))) { + const unsigned NElts = MTy.getVectorNumElements(); + if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 && + isPowerOf2_32(NElts)) + // Reduction corresponding to series of fadd instructions is lowered to + // series of faddp instructions. faddp has latency/throughput that + // matches fadd instruction and hence, every faddp instruction can be + // considered to have a relative cost = 1 with + // CostKind = TCK_RecipThroughput. + // An faddp will pairwise add vector elements, so the size of input + // vector reduces by half every time, requiring + // #(faddp instructions) = log2_32(NElts). + return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts); + } + break; case ISD::ADD: if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) return (LT.first - 1) + Entry->Cost; diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll index 58cb8c2..a95542f 100644 --- a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll +++ b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll @@ -76,49 +76,49 @@ define void @fast_fp_reductions() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; FP16-LABEL: 'fast_fp_reductions' -; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f16_fast = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %fadd_v16f16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f16_fast = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v16f16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -134,20 +134,20 @@ define void @fast_fp_reductions() { ; BF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll index edc0381..6dceabe 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux \ -; RUN: -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK,NOFP16 -; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux \ -; RUN: -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FP16 +; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK,NOFP16 +; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FULLFP16 define half @reduce_fast_half2(<2 x half> %vec2) { ; CHECK-LABEL: define half @reduce_fast_half2( @@ -79,20 +77,26 @@ entry: } define half @reduce_fast_half8(<8 x half> %vec8) { -; CHECK-LABEL: define half @reduce_fast_half8( -; CHECK-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4 -; CHECK-NEXT: [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5 -; CHECK-NEXT: [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6 -; CHECK-NEXT: [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT4]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[ELT5]], [[ELT6]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[OP_RDX2]], [[ELT7]] -; CHECK-NEXT: ret half [[OP_RDX3]] +; NOFP16-LABEL: define half @reduce_fast_half8( +; NOFP16-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] { +; NOFP16-NEXT: [[ENTRY:.*:]] +; NOFP16-NEXT: [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4 +; NOFP16-NEXT: [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5 +; NOFP16-NEXT: [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6 +; NOFP16-NEXT: [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7 +; NOFP16-NEXT: [[TMP0:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; NOFP16-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]]) +; NOFP16-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT4]] +; NOFP16-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[ELT5]], [[ELT6]] +; NOFP16-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]] +; NOFP16-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[OP_RDX2]], [[ELT7]] +; NOFP16-NEXT: ret half [[OP_RDX3]] +; +; FULLFP16-LABEL: define half @reduce_fast_half8( +; FULLFP16-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] { +; FULLFP16-NEXT: [[ENTRY:.*:]] +; FULLFP16-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> [[VEC8]]) +; FULLFP16-NEXT: ret half [[TMP0]] ; entry: %elt0 = extractelement <8 x half> %vec8, i64 0 @@ -154,37 +158,11 @@ entry: } define half @reduce_fast_half16(<16 x half> %vec16) { -; NOFP16-LABEL: define half @reduce_fast_half16( -; NOFP16-SAME: <16 x half> [[VEC16:%.*]]) #[[ATTR0]] { -; NOFP16-NEXT: [[ENTRY:.*:]] -; NOFP16-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> [[VEC16]]) -; NOFP16-NEXT: ret half [[TMP0]] -; -; FP16-LABEL: define half @reduce_fast_half16( -; FP16-SAME: <16 x half> [[VEC16:%.*]]) #[[ATTR0]] { -; FP16-NEXT: [[ENTRY:.*:]] -; FP16-NEXT: [[ELT4:%.*]] = extractelement <16 x half> [[VEC16]], i64 4 -; FP16-NEXT: [[ELT5:%.*]] = extractelement <16 x half> [[VEC16]], i64 5 -; FP16-NEXT: [[ELT6:%.*]] = extractelement <16 x half> [[VEC16]], i64 6 -; FP16-NEXT: [[ELT7:%.*]] = extractelement <16 x half> [[VEC16]], i64 7 -; FP16-NEXT: [[ELT12:%.*]] = extractelement <16 x half> [[VEC16]], i64 12 -; FP16-NEXT: [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13 -; FP16-NEXT: [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14 -; FP16-NEXT: [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15 -; FP16-NEXT: [[TMP0:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; FP16-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]]) -; FP16-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11> -; FP16-NEXT: [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP2]]) -; FP16-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[TMP3]] -; FP16-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[OP_RDX]], [[ELT4]] -; FP16-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[ELT5]], [[ELT6]] -; FP16-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[ELT7]], [[ELT12]] -; FP16-NEXT: [[OP_RDX4:%.*]] = fadd fast half [[ELT13]], [[ELT14]] -; FP16-NEXT: [[OP_RDX5:%.*]] = fadd fast half [[OP_RDX1]], [[OP_RDX2]] -; FP16-NEXT: [[OP_RDX6:%.*]] = fadd fast half [[OP_RDX3]], [[OP_RDX4]] -; FP16-NEXT: [[OP_RDX7:%.*]] = fadd fast half [[OP_RDX5]], [[OP_RDX6]] -; FP16-NEXT: [[OP_RDX8:%.*]] = fadd fast half [[OP_RDX7]], [[ELT15]] -; FP16-NEXT: ret half [[OP_RDX8]] +; CHECK-LABEL: define half @reduce_fast_half16( +; CHECK-SAME: <16 x half> [[VEC16:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> [[VEC16]]) +; CHECK-NEXT: ret half [[TMP0]] ; entry: %elt0 = extractelement <16 x half> %vec16, i64 0 @@ -512,19 +490,11 @@ define float @reduce_fast_float_case1(ptr %a) { ; CHECK-LABEL: define float @reduce_fast_float_case1( ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[A]], align 4 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4 -; CHECK-NEXT: [[LOAD1:%.*]] = load float, ptr [[GEP]], align 4 -; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[LOAD1]], [[LOAD]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 8 -; CHECK-NEXT: [[LOAD2:%.*]] = load float, ptr [[GEP2]], align 4 -; CHECK-NEXT: [[ADD2:%.*]] = fadd fast float [[LOAD2]], [[ADD1]] -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 12 -; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr [[GEP3]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[LOAD3]], [[ADD2]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4 ; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[LOAD4:%.*]] = load float, ptr [[GEP4]], align 4 -; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float [[LOAD4]], [[ADD3]] +; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP0]]) +; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float [[TMP1]], [[LOAD4]] ; CHECK-NEXT: ret float [[ADD4]] ; entry: @@ -586,24 +556,11 @@ define float @reduce_fast_float_case2(ptr %a, ptr %b) { ; CHECK-LABEL: define float @reduce_fast_float_case2( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[GEPA2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 -; CHECK-NEXT: [[GEPA3:%.*]] = getelementptr inbounds float, ptr [[A]], i32 3 -; CHECK-NEXT: [[GEPB2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 -; CHECK-NEXT: [[GEPB3:%.*]] = getelementptr inbounds float, ptr [[B]], i32 3 -; CHECK-NEXT: [[LOADA2:%.*]] = load float, ptr [[GEPA2]], align 4 -; CHECK-NEXT: [[LOADA3:%.*]] = load float, ptr [[GEPA3]], align 4 -; CHECK-NEXT: [[LOADB2:%.*]] = load float, ptr [[GEPB2]], align 4 -; CHECK-NEXT: [[LOADB3:%.*]] = load float, ptr [[GEPB3]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <2 x float> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[ADD2:%.*]] = fadd fast float [[LOADA3]], [[LOADB2]] -; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[LOADA2]], [[LOADB3]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[RED1:%.*]] = fadd fast float [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[RED2:%.*]] = fadd fast float [[ADD2]], [[RED1]] -; CHECK-NEXT: [[RED3:%.*]] = fadd fast float [[ADD3]], [[RED2]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP2]], <4 x float> [[TMP0]], i64 4) +; CHECK-NEXT: [[RED3:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) ; CHECK-NEXT: ret float [[RED3]] ; entry: |
