aboutsummaryrefslogtreecommitdiff
path: root/llvm
diff options
context:
space:
mode:
authorFlorian Hahn <flo@fhahn.com>2024-06-19 17:06:52 +0100
committerFlorian Hahn <flo@fhahn.com>2024-06-19 17:06:52 +0100
commitb9702bb12f19b7bdb7f4420f94ee603d29d1bf1b (patch)
tree829f59db55f8fa1169977d147ff1229c9b8d46f1 /llvm
parent04cd06906288dcb148de37d7c3e6c009c3e3b726 (diff)
downloadllvm-b9702bb12f19b7bdb7f4420f94ee603d29d1bf1b.zip
llvm-b9702bb12f19b7bdb7f4420f94ee603d29d1bf1b.tar.gz
llvm-b9702bb12f19b7bdb7f4420f94ee603d29d1bf1b.tar.bz2
[LV] Consider insts feeding interleave group pointers free.
For interleave groups, we only generate a pointer for the start of the interleave group (the instruction at the insert position). The other addresses for other members are alreayd considered free, but so are their operands, if they are only used in address computations for other interleave group members.
Diffstat (limited to 'llvm')
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp30
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll28
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/pr47437.ll2
3 files changed, 34 insertions, 26 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4080837..9571cfe 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7048,6 +7048,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
+ SmallSetVector<Value *, 4> DeadInterleavePointerOps;
for (BasicBlock *BB : TheLoop->blocks())
for (Instruction &I : *BB) {
// Find all stores to invariant variables. Since they are going to sink
@@ -7058,25 +7059,32 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
ValuesToIgnore.insert(&I);
// For interleave groups, we only create a pointer for the start of the
- // interleave group. Mark single-use ops feeding interleave group mem ops
- // as free when vectorizing, expect the insert-pos memory op.
+ // interleave group. Queue up addresses of group members except the insert
+ // position for further processing.
if (isAccessInterleaved(&I)) {
auto *Group = getInterleavedAccessGroup(&I);
if (Group->getInsertPos() == &I)
continue;
Value *PointerOp = getLoadStorePointerOperand(&I);
- SmallSetVector<Value *, 4> Worklist;
- Worklist.insert(PointerOp);
- for (unsigned I = 0; I != Worklist.size(); ++I) {
- auto *Op = dyn_cast<Instruction>(Worklist[I]);
- if (!Op || !TheLoop->contains(Op) || !Op->hasOneUse())
- continue;
- VecValuesToIgnore.insert(Op);
- Worklist.insert(Op->op_begin(), Op->op_end());
- }
+ DeadInterleavePointerOps.insert(PointerOp);
}
}
+ // Mark ops feeding interleave group members as free, if they are only used
+ // by other dead computations.
+ for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
+ auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
+ if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
+ Instruction *UI = cast<Instruction>(U);
+ return !VecValuesToIgnore.contains(U) &&
+ (!isAccessInterleaved(UI) ||
+ getInterleavedAccessGroup(UI)->getInsertPos() == UI);
+ }))
+ continue;
+ VecValuesToIgnore.insert(Op);
+ DeadInterleavePointerOps.insert(Op->op_begin(), Op->op_end());
+ }
+
// Ignore type-promoting instructions we identified during reduction
// detection.
for (const auto &Reduction : Legal->getReductionVars()) {
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
index fe339cb..0091ebd 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
@@ -86,29 +86,29 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP40:%.*]] = load float, ptr [[P_INVAR]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP40]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP40]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP41:%.*]] = shl i64 [[TMP39]], 2
; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[P_INVAR]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT33:%.*]] = insertelement <4 x float> poison, float [[TMP42]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT34:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT33]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <2 x float> poison, float [[TMP42]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT27]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP43:%.*]] = or disjoint i64 [[TMP41]], 3
; CHECK-NEXT: [[TMP44:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[TMP43]]
; CHECK-NEXT: [[TMP45:%.*]] = getelementptr float, ptr [[TMP44]], i32 -3
-; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLAT]], <4 x float> [[BROADCAST_SPLAT34]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <8 x float> [[TMP46]], <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x float> [[TMP47]], <16 x float> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: store <16 x float> [[INTERLEAVED_VEC]], ptr [[TMP45]], align 4
+; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLAT]], <2 x float> [[BROADCAST_SPLAT28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <4 x float> [[TMP46]], <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP47]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP45]], align 4
; CHECK-NEXT: [[TMP48:%.*]] = load float, ptr [[P_INVAR]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <4 x float> poison, float [[TMP48]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT35]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <2 x float> poison, float [[TMP48]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT29]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP49:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[TMP43]]
; CHECK-NEXT: [[TMP50:%.*]] = getelementptr float, ptr [[TMP49]], i32 -3
+; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLAT30]], <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLAT36]], <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <8 x float> [[TMP51]], <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INTERLEAVED_VEC37:%.*]] = shufflevector <16 x float> [[TMP52]], <16 x float> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: store <16 x float> [[INTERLEAVED_VEC37]], ptr [[TMP50]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[INTERLEAVED_VEC31:%.*]] = shufflevector <8 x float> [[TMP51]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC31]], ptr [[TMP50]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP53:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP53]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
index 97e1c1c..7cbf0ab 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
@@ -3,7 +3,7 @@
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
-; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mcpu=slm | FileCheck %s --check-prefix=SSE2
+; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mcpu=slm | FileCheck %s --check-prefix=SSE41
define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readonly %s1, ptr noalias nocapture readonly %s2, i32 %n) {
; SSE2-LABEL: @test_muladd(