author    David Sherwood <david.sherwood@arm.com>    2023-04-25 08:46:41 +0000
committer David Sherwood <david.sherwood@arm.com>    2023-05-18 10:35:57 +0000
commit    c7dbe326dff81273eabe339fe69cd7bef947619c
tree      600d5c957ee03fb2dd93f83e8c69a1ae4252aed9
parent    01efcec6dbd1431d2ac112f537d5639a9eab18b2

[AArch64][LoopVectorize] Enable tail-folding of simple loops on neoverse-v1
This patch enables tail-folding of simple loops by default when
targeting the neoverse-v1 CPU. "Simple" here excludes loops with
recurrences or reductions, as well as reversed loops.

New tests have been added here:
Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
In terms of SPEC2017, only one benchmark is noticeably affected when
building with "-Ofast -mcpu=neoverse-v1 -flto" (+ faster, - slower):
525.x264: +7.0%
Differential Revision: https://reviews.llvm.org/D130618
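
For illustration, a minimal sketch (hypothetical C++ code, not taken from
this patch) of the distinction the message draws: the first loop below is
"simple" and, once tail-folded, becomes a single predicated vector loop
whose final partial iteration is handled by an active lane mask, with no
scalar epilogue; the second contains a reduction, so it falls outside the
new default on this CPU.

    // Hypothetical examples; function names are illustrative only.
    void memset_like(int *ptr, int val, long n) {
      // No cross-iteration dependence: a "simple" loop, tail-foldable.
      for (long i = 0; i < n; ++i)
        ptr[i] = val;
    }

    float reduce(const float *a, long n) {
      // Loop-carried reduction: excluded from the new neoverse-v1 default.
      float sum = 0.0f;
      for (long i = 0; i < n; ++i)
        sum += a[i];
      return sum;
    }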
5 files changed, 74 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 843c3b7..670f84d 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -226,6 +226,7 @@ void AArch64Subtarget::initializeProperties() {
     PrefLoopAlignment = Align(32);
     MaxBytesForLoopAlignment = 16;
     VScaleForTuning = 2;
+    DefaultSVETFOpts = TailFoldingOpts::Simple;
     break;
   case Neoverse512TVB:
     PrefFunctionAlignment = Align(16);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index c326c12f..385ba9a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -39,6 +39,9 @@ static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                             cl::init(10), cl::Hidden);
 
+static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
+                                                  cl::init(15), cl::Hidden);
+
 namespace {
 class TailFoldingOption {
   // These bitfields will only ever be set to something non-zero in operator=,
@@ -3558,8 +3561,19 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
   if (Required == TailFoldingOpts::Disabled)
     Required |= TailFoldingOpts::Simple;
 
-  return TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
-                                        Required);
+  if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
+                                      Required))
+    return false;
+
+  // Don't tail-fold for tight loops where we would be better off interleaving
+  // with an unpredicated loop.
+  unsigned NumInsns = 0;
+  for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
+    NumInsns += BB->sizeWithoutDebug();
+  }
+
+  // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
+  return NumInsns >= SVETailFoldInsnThreshold;
 }
 
 InstructionCost
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
index c65b10c..454a978 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
@@ -1,7 +1,5 @@
 ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
-; RUN:   -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-EPILOG
-; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
-; RUN:   -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-EPILOG
+; RUN:   -mcpu=neoverse-v1 -sve-tail-folding=disabled < %s | FileCheck %s --check-prefix=CHECK-EPILOG
 ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
 ; RUN:   -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
 ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
index c3348fd..01f4d09 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
@@ -1,11 +1,16 @@
-; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
-; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
-; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
-; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=simple+reductions+recurrences+reverse -S | FileCheck %s -check-prefix=CHECK-TF
-; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
-; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
-; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=all+noreverse -S | FileCheck %s -check-prefix=CHECK-TF-NOREV
-; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -S | FileCheck %s -check-prefix=CHECK-NOTF
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=simple+reductions+recurrences+reverse -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -S -mcpu=neoverse-v1 -sve-tail-folding=default+reductions+recurrences+reverse | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all+noreverse -S | FileCheck %s -check-prefix=CHECK-TF-NOREV
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -S -sve-tail-folding=default -mcpu=neoverse-v1 | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -S -mcpu=neoverse-v1 -sve-tail-folding=default | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -S -mcpu=neoverse-v1 | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -58,6 +63,14 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-TF-ONLYRED-NOT: %{{.*}} = phi <vscale x 4 x i1>
 ; CHECK-TF-ONLYRED:     store <vscale x 4 x i32> %[[SPLAT]], ptr
 
+; CHECK-NEOVERSE-V1-LABEL: @simple_memset(
+; CHECK-NEOVERSE-V1: vector.ph:
+; CHECK-NEOVERSE-V1:   %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i64 0
+; CHECK-NEOVERSE-V1:   %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEOVERSE-V1: vector.body:
+; CHECK-NEOVERSE-V1:   %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1:   call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]
+
 entry:
   br label %while.body
 
@@ -129,6 +142,15 @@ define float @fadd_red_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-TF-ONLYRED:     %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
 ; CHECK-TF-ONLYRED: middle.block:
 ; CHECK-TF-ONLYRED-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+
+; CHECK-NEOVERSE-V1-LABEL: @fadd_red_fast
+; CHECK-NEOVERSE-V1: vector.body:
+; CHECK-NEOVERSE-V1-NOT: %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1:   %[[LOAD:.*]] = load <vscale x 4 x float>
+; CHECK-NEOVERSE-V1:   %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
+; CHECK-NEOVERSE-V1: middle.block:
+; CHECK-NEOVERSE-V1-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])
+
 entry:
   br label %for.body
 
@@ -225,6 +247,19 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-TF-ONLYRED:     %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
 ; CHECK-TF-ONLYRED:     store <vscale x 4 x i32> %[[ADD]]
 
+; CHECK-NEOVERSE-V1-LABEL: @add_recur
+; CHECK-NEOVERSE-V1: entry:
+; CHECK-NEOVERSE-V1:   %[[PRE:.*]] = load i32, ptr %src, align 4
+; CHECK-NEOVERSE-V1: vector.ph:
+; CHECK-NEOVERSE-V1:   %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
+; CHECK-NEOVERSE-V1: vector.body:
+; CHECK-NEOVERSE-V1-NOT: %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1:   %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
+; CHECK-NEOVERSE-V1:   %[[LOAD]] = load <vscale x 4 x i32>
+; CHECK-NEOVERSE-V1:   %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-NEOVERSE-V1:   %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
+; CHECK-NEOVERSE-V1:   store <vscale x 4 x i32> %[[ADD]]
+
 entry:
   %.pre = load i32, ptr %src, align 4
   br label %for.body
@@ -276,6 +311,12 @@ define void @interleave(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-TF-NOREV:   %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-TF-NOREV:   %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 
+; CHECK-NEOVERSE-V1-LABEL: @interleave(
+; CHECK-NEOVERSE-V1: vector.body:
+; CHECK-NEOVERSE-V1:   %[[LOAD:.*]] = load <8 x float>, ptr
+; CHECK-NEOVERSE-V1:   %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEOVERSE-V1:   %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
 entry:
   br label %for.body
 
@@ -335,6 +376,12 @@ define void @reverse(ptr noalias %dst, ptr noalias %src) #0 {
 ; CHECK-TF-NOREC:   %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
 ; CHECK-TF-NOREC:   %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0({{.*}} <vscale x 2 x i1> %reverse
 
+; CHECK-TF-NEOVERSE-V1-LABEL: @reverse(
+; CHECK-TF-NEOVERSE-V1: vector.body:
+; CHECK-TF-NEOVERSE-V1-NOT: %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-TF-NEOVERSE-V1:   %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* %18, align 8
+; CHECK-TF-NEOVERSE-V1:   %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll
index 81ceb0a..45fca16 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='loop-vectorize,instcombine' -sve-tail-folding=all -S < %s | FileCheck %s
+; RUN: opt -passes='loop-vectorize,instcombine' -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all -S < %s | FileCheck %s
 
 target triple = "aarch64"
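
A closing note on the new heuristic in preferPredicateOverEpilogue: very
small loops are no longer tail-folded, because roughly 4 of the counted
instructions are pure loop overhead (IV PHI, IV add, IV compare and branch)
and such loops tend to do better as an interleaved, unpredicated body. A
standalone sketch of the idea (hypothetical helper, not the in-tree code,
which instead walks Loop::blocks() and sums BasicBlock::sizeWithoutDebug()):

    #include <numeric>
    #include <vector>

    // Hypothetical model of the cutoff. BlockSizes stands in for the
    // per-basic-block non-debug instruction counts; the default Threshold
    // mirrors cl::init(15) on the new -sve-tail-folding-insn-threshold
    // option.
    static bool preferTailFolding(const std::vector<unsigned> &BlockSizes,
                                  unsigned Threshold = 15) {
      unsigned NumInsns =
          std::accumulate(BlockSizes.begin(), BlockSizes.end(), 0u);
      // About 4 of these are IV PHI, IV add, IV compare and branch, so a
      // loop below the threshold contains little real work to predicate.
      return NumInsns >= Threshold;
    }

This is also why the updated RUN lines pass
-sve-tail-folding-insn-threshold=0: it neutralises the size cutoff so the
existing tests keep exercising tail-folding regardless of loop size.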