aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@outlook.com>2024-07-16 12:45:58 -0400
committerGitHub <noreply@github.com>2024-07-16 12:45:58 -0400
commit15915c06d54151b78378a7cd029e1a8c09e764b3 (patch)
tree3402f76450c99145ade615a426294e95e0761048
parent8ff233f4f16f4e372cba92ca5175fcd5e1791195 (diff)
downloadllvm-15915c06d54151b78378a7cd029e1a8c09e764b3.zip
llvm-15915c06d54151b78378a7cd029e1a8c09e764b3.tar.gz
llvm-15915c06d54151b78378a7cd029e1a8c09e764b3.tar.bz2
[SLP]Do not vectorize small (<=2) buildvector/buildvalue sequences with MaxVF==true.
If MaxVFOnly for buildvector/buildvalue vectorization is set to true and the total number of elements to vectorize is <= 2, better to try to vectorize reductions at first, which may produce larger tree (reductions have a limit of at least 4 elements to vectorize). Smaller buildvector/buildvalue sequence will be attempted to vectorize later, with MaxVFOnly set to false. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/98957
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp16
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll37
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll3
3 files changed, 35 insertions, 21 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a9385bd..7b981be 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -18285,6 +18285,14 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
return false;
+ if (MaxVFOnly && BuildVectorOpds.size() == 2) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
+ << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
+ "trying reduction first.";
+ });
+ return false;
+ }
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
// Aggregate value is unlikely to be processed in vector register.
return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
@@ -18301,6 +18309,14 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
isFixedVectorShuffle(BuildVectorOpds, Mask)))
return false;
+ if (MaxVFOnly && BuildVectorInsts.size() == 2) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
+ << "Cannot SLP vectorize list: only 2 elements of buildvector, "
+ "trying reduction first.";
+ });
+ return false;
+ }
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
index a24cb81..c6209fd 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
@@ -10,33 +10,32 @@ define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[ARG3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], <float 1.000000e+00, float 0.000000e+00>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP6]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[TMP7]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[VAL12]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[TMP5]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]]
; CHECK: bb18:
-; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP8]], [[BB:%.*]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP8]], i32 2
-; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP10]], 2.000000e+00
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
-; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP11]], 3.000000e+00
+; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP6]], [[BB:%.*]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
+; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP8]], 2.000000e+00
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i32 3
+; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP9]], 3.000000e+00
; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]]
; CHECK: bb25:
-; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x float> [ [[TMP9]], [[BB18]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x float> [ [[TMP7]], [[BB18]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
; CHECK-NEXT: br label [[BB30:%.*]]
; CHECK: bb30:
; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
-; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[TMP13]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1
-; CHECK-NEXT: [[TMP15:%.*]] = uitofp <4 x i8> [[TMP14]] to <4 x float>
-; CHECK-NEXT: [[TMP16:%.*]] = fsub fast <4 x float> [[TMP15]], [[TMP3]]
-; CHECK-NEXT: [[TMP17:%.*]] = fmul fast <4 x float> [[TMP16]], [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP17]])
+; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[TMP11]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = uitofp <4 x i8> [[TMP12]] to <4 x float>
+; CHECK-NEXT: [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]]
+; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP10]]
+; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]])
; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]])
-; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP18]])
+; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP16]])
; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]])
; CHECK-NEXT: br i1 [[ARG8:%.*]], label [[BB30]], label [[BB57]]
; CHECK: bb57:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll
index bf4903f..2f59363 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll
@@ -7,8 +7,7 @@
; YAML-NEXT: Name: NotPossible
; YAML-NEXT: Function: g
; YAML-NEXT: Args:
-; YAML-NEXT: - String: 'Cannot SLP vectorize list: vectorization was impossible'
-; YAML-NEXT: - String: ' with available vectorization factors'
+; YAML-NEXT: - String: 'Cannot SLP vectorize list: only 2 elements of buildvector, trying reduction first.'
define <2 x i32> @g(<2 x i32> %x, i32 %a, i32 %b) {
; CHECK-LABEL: @g(