-rw-r--r--  llvm/include/llvm/Analysis/TargetTransformInfo.h | 18
-rw-r--r--  llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 9
-rw-r--r--  llvm/lib/Analysis/TargetTransformInfo.cpp | 10
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 12
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 12
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 54
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.h | 4
-rw-r--r--  llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp | 7
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 71
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll | 2
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll | 10
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll | 130
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll | 130
14 files changed, 283 insertions, 188 deletions
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index a405b53..179f0e8 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -663,6 +663,12 @@ public:
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
/// Return true if the target supports masked gather.
bool isLegalMaskedGather(Type *DataType, Align Alignment) const;
+ /// Return true if the target forces scalarizing of llvm.masked.gather
+ /// intrinsics.
+ bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const;
+ /// Return true if the target forces scalarizing of llvm.masked.scatter
+ /// intrinsics.
+ bool forceScalarizeMaskedScatter(VectorType *Type, Align Alignment) const;
/// Return true if the target supports masked compress store.
bool isLegalMaskedCompressStore(Type *DataType) const;
@@ -1544,6 +1550,10 @@ public:
virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0;
virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0;
virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0;
+ virtual bool forceScalarizeMaskedGather(VectorType *DataType,
+ Align Alignment) = 0;
+ virtual bool forceScalarizeMaskedScatter(VectorType *DataType,
+ Align Alignment) = 0;
virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
virtual bool enableOrderedReductions() = 0;
@@ -1947,6 +1957,14 @@ public:
bool isLegalMaskedGather(Type *DataType, Align Alignment) override {
return Impl.isLegalMaskedGather(DataType, Alignment);
}
+ bool forceScalarizeMaskedGather(VectorType *DataType,
+ Align Alignment) override {
+ return Impl.forceScalarizeMaskedGather(DataType, Alignment);
+ }
+ bool forceScalarizeMaskedScatter(VectorType *DataType,
+ Align Alignment) override {
+ return Impl.forceScalarizeMaskedScatter(DataType, Alignment);
+ }
bool isLegalMaskedCompressStore(Type *DataType) override {
return Impl.isLegalMaskedCompressStore(DataType);
}
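
Taken together, the two new hooks separate "can the target encode this as a hardware gather/scatter" (isLegalMaskedGather/isLegalMaskedScatter) from "should it be scalarized anyway" (forceScalarizeMaskedGather/forceScalarizeMaskedScatter). A minimal standalone sketch of how callers are expected to combine the two queries; TTIView, keepMaskedGatherIntrinsic and the stand-in types below are illustrative, not the real LLVM API:

#include <cstdint>

// Minimal stand-ins for llvm::VectorType and llvm::Align; illustrative only.
struct VectorType { unsigned NumElements = 0; };
struct Align { uint64_t Value = 1; };

// Hypothetical view of the two TTI answers for a given vector type/alignment.
struct TTIView {
  bool LegalGather = false;     // what isLegalMaskedGather would answer
  bool ForceScalarize = false;  // what forceScalarizeMaskedGather would answer
};

// A masked gather stays an intrinsic only if it is legal for the target and
// the target does not force scalarization; otherwise it gets expanded to
// scalar code. This is the combined check the patch adds to callers.
bool keepMaskedGatherIntrinsic(const TTIView &TTI, const VectorType &, Align) {
  return TTI.LegalGather && !TTI.ForceScalarize;
}

The same combined check shows up below in X86TTIImpl::getGatherScatterOpCost and in ScalarizeMaskedMemIntrin's optimizeCallInst.
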
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 26a696f..2c70780 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -267,6 +267,15 @@ public:
return false;
}
+ bool forceScalarizeMaskedGather(VectorType *DataType, Align Alignment) const {
+ return false;
+ }
+
+ bool forceScalarizeMaskedScatter(VectorType *DataType,
+ Align Alignment) const {
+ return false;
+ }
+
bool isLegalMaskedCompressStore(Type *DataType) const { return false; }
bool isLegalMaskedExpandLoad(Type *DataType) const { return false; }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 6aa9a77..25e9dee9 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -408,6 +408,16 @@ bool TargetTransformInfo::isLegalMaskedScatter(Type *DataType,
return TTIImpl->isLegalMaskedScatter(DataType, Alignment);
}
+bool TargetTransformInfo::forceScalarizeMaskedGather(VectorType *DataType,
+ Align Alignment) const {
+ return TTIImpl->forceScalarizeMaskedGather(DataType, Alignment);
+}
+
+bool TargetTransformInfo::forceScalarizeMaskedScatter(VectorType *DataType,
+ Align Alignment) const {
+ return TTIImpl->forceScalarizeMaskedScatter(DataType, Alignment);
+}
+
bool TargetTransformInfo::isLegalMaskedCompressStore(Type *DataType) const {
return TTIImpl->isLegalMaskedCompressStore(DataType);
}
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 602c674..e0750a9 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1116,18 +1116,6 @@ bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
return false;
- // This method is called in 2 places:
- // - from the vectorizer with a scalar type, in which case we need to get
- // this as good as we can with the limited info we have (and rely on the cost
- // model for the rest).
- // - from the masked intrinsic lowering pass with the actual vector type.
- // For MVE, we have a custom lowering pass that will already have custom
- // legalised any gathers that we can to MVE intrinsics, and want to expand all
- // the rest. The pass runs before the masked intrinsic lowering pass, so if we
- // are here, we know we want to expand.
- if (isa<VectorType>(Ty))
- return false;
-
unsigned EltWidth = Ty->getScalarSizeInBits();
return ((EltWidth == 32 && Alignment >= 4) ||
(EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
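
With the vector-type early-out moved into forceScalarizeMaskedGather (see the ARMTargetTransformInfo.h hunk below), what remains here is a pure element-width/alignment rule for MVE. A standalone sketch of that rule; the function name is made up, and the in-tree code additionally checks EnableMaskedGatherScatters and ST->hasMVEIntegerOps():

// Mirrors the MVE rule left in ARMTTIImpl::isLegalMaskedGather: 32-bit
// elements need 4-byte alignment, 16-bit elements need 2-byte alignment,
// and 8-bit elements are always acceptable.
bool mveGatherElementLegal(unsigned EltWidthBits, unsigned AlignmentBytes) {
  return (EltWidthBits == 32 && AlignmentBytes >= 4) ||
         (EltWidthBits == 16 && AlignmentBytes >= 2) ||
         EltWidthBits == 8;
}
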
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index a56886d..5bb8489 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -189,6 +189,18 @@ public:
return isLegalMaskedLoad(DataTy, Alignment);
}
+ bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
+ // For MVE, we have a custom lowering pass that will already have custom
+ // legalised any gathers that we can lower to MVE intrinsics, and want to
+ // expand all the rest. The pass runs before the masked intrinsic lowering
+ // pass.
+ return true;
+ }
+
+ bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) {
+ return forceScalarizeMaskedGather(VTy, Alignment);
+ }
+
bool isLegalMaskedGather(Type *Ty, Align Alignment);
bool isLegalMaskedScatter(Type *Ty, Align Alignment) {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 4a766be..5b95c10 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4995,9 +4995,13 @@ InstructionCost X86TTIImpl::getGatherScatterOpCost(
const Instruction *I = nullptr) {
if (CostKind != TTI::TCK_RecipThroughput) {
if ((Opcode == Instruction::Load &&
- isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
+ isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
+ !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
+ Align(Alignment))) ||
(Opcode == Instruction::Store &&
- isLegalMaskedScatter(SrcVTy, Align(Alignment))))
+ isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
+ !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
+ Align(Alignment))))
return 1;
return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
Alignment, CostKind, I);
@@ -5012,9 +5016,13 @@ InstructionCost X86TTIImpl::getGatherScatterOpCost(
unsigned AddressSpace = PtrTy->getAddressSpace();
if ((Opcode == Instruction::Load &&
- !isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
+ (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
+ forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
+ Align(Alignment)))) ||
(Opcode == Instruction::Store &&
- !isLegalMaskedScatter(SrcVTy, Align(Alignment))))
+ (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
+ forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
+ Align(Alignment)))))
return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
AddressSpace);
@@ -5137,35 +5145,21 @@ bool X86TTIImpl::supportsGather() const {
return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
}
+bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
+ // Gather / Scatter for vector 2 is not profitable on KNL / SKX
+ // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
+ // it to 8 elements, but zeroing upper bits of the mask vector will add more
+ // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
+ // Check, maybe the gather/scatter instruction is better in the VariableMask
+ // case.
+ unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
+ return NumElts == 1 ||
+ (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
+}
+
bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
if (!supportsGather())
return false;
-
- // This function is called now in two cases: from the Loop Vectorizer
- // and from the Scalarizer.
- // When the Loop Vectorizer asks about legality of the feature,
- // the vectorization factor is not calculated yet. The Loop Vectorizer
- // sends a scalar type and the decision is based on the width of the
- // scalar element.
- // Later on, the cost model will estimate usage this intrinsic based on
- // the vector type.
- // The Scalarizer asks again about legality. It sends a vector type.
- // In this case we can reject non-power-of-2 vectors.
- // We also reject single element vectors as the type legalizer can't
- // scalarize it.
- if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
- unsigned NumElts = DataVTy->getNumElements();
- if (NumElts == 1)
- return false;
- // Gather / Scatter for vector 2 is not profitable on KNL / SKX
- // Vector-4 of gather/scatter instruction does not exist on KNL.
- // We can extend it to 8 elements, but zeroing upper bits of
- // the mask vector will add more instructions. Right now we give the scalar
- // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter
- // instruction is better in the VariableMask case.
- if (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())))
- return false;
- }
Type *ScalarTy = DataTy->getScalarType();
if (ScalarTy->isPointerTy())
return true;
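
The new X86 hook carries the old small-vector heuristic out of isLegalMaskedGather: a 1-element gather is never worth a gather instruction, and on AVX-512 without VL (e.g. KNL) the 2- and 4-element forms are unprofitable or unavailable, so they are scalarized too. A standalone reduction of that predicate, with the subtarget queries flattened to booleans (the function name is illustrative; the real code reads NumElts from the FixedVectorType and the features from the X86Subtarget):

bool x86ForceScalarizeGather(unsigned NumElts, bool HasAVX512, bool HasVLX) {
  // Single-element gathers are always scalarized; on AVX-512 targets
  // without VL, 2- and 4-element gathers are scalarized as well.
  return NumElts == 1 ||
         (HasAVX512 && (NumElts == 2 || (NumElts == 4 && !HasVLX)));
}
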
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 11e9cb0..6971507 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -226,6 +226,10 @@ public:
bool isLegalMaskedStore(Type *DataType, Align Alignment);
bool isLegalNTLoad(Type *DataType, Align Alignment);
bool isLegalNTStore(Type *DataType, Align Alignment);
+ bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment);
+ bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) {
+ return forceScalarizeMaskedGather(VTy, Alignment);
+ }
bool isLegalMaskedGather(Type *DataType, Align Alignment);
bool isLegalMaskedScatter(Type *DataType, Align Alignment);
bool isLegalMaskedExpandLoad(Type *DataType);
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index 1284bae..29cea42 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -959,7 +959,8 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
Type *LoadTy = CI->getType();
Align Alignment = DL.getValueOrABITypeAlignment(MA,
LoadTy->getScalarType());
- if (TTI.isLegalMaskedGather(LoadTy, Alignment))
+ if (TTI.isLegalMaskedGather(LoadTy, Alignment) &&
+ !TTI.forceScalarizeMaskedGather(cast<VectorType>(LoadTy), Alignment))
return false;
scalarizeMaskedGather(DL, CI, DTU, ModifiedDT);
return true;
@@ -970,7 +971,9 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
Type *StoreTy = CI->getArgOperand(0)->getType();
Align Alignment = DL.getValueOrABITypeAlignment(MA,
StoreTy->getScalarType());
- if (TTI.isLegalMaskedScatter(StoreTy, Alignment))
+ if (TTI.isLegalMaskedScatter(StoreTy, Alignment) &&
+ !TTI.forceScalarizeMaskedScatter(cast<VectorType>(StoreTy),
+ Alignment))
return false;
scalarizeMaskedScatter(DL, CI, DTU, ModifiedDT);
return true;
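
With the extra check in place, a gather or scatter that the target reports as legal but force-scalarized now falls through to scalarizeMaskedGather/scalarizeMaskedScatter. As a rough semantic model of what that expansion computes for a gather (not the IR the pass actually emits, which uses per-lane branches or selects rather than a loop):

#include <cstddef>

// One guarded scalar load per lane; masked-off lanes keep the pass-through
// value, matching the semantics of llvm.masked.gather.
void scalarizedGather(int *Result, int *const *Ptrs, const bool *Mask,
                      const int *PassThru, size_t NumLanes) {
  for (size_t Lane = 0; Lane < NumLanes; ++Lane)
    Result[Lane] = Mask[Lane] ? *Ptrs[Lane] : PassThru[Lane];
}
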
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e48f951..160c5ce 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1543,13 +1543,16 @@ public:
/// Returns true if the target machine can represent \p V as a masked gather
/// or scatter operation.
- bool isLegalGatherOrScatter(Value *V) {
+ bool isLegalGatherOrScatter(Value *V,
+ ElementCount VF = ElementCount::getFixed(1)) {
bool LI = isa<LoadInst>(V);
bool SI = isa<StoreInst>(V);
if (!LI && !SI)
return false;
auto *Ty = getLoadStoreType(V);
Align Align = getLoadStoreAlignment(V);
+ if (VF.isVector())
+ Ty = VectorType::get(Ty, VF);
return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
(SI && TTI.isLegalMaskedScatter(Ty, Align));
}
@@ -1564,16 +1567,17 @@ public:
}
/// Returns true if \p I is an instruction that will be scalarized with
- /// predication. Such instructions include conditional stores and
- /// instructions that may divide by zero.
- /// If a non-zero VF has been calculated, we check if I will be scalarized
- /// predication for that VF.
- bool isScalarWithPredication(Instruction *I) const;
+ /// predication when vectorizing \p I with vectorization factor \p VF. Such
+ /// instructions include conditional stores and instructions that may divide
+ /// by zero.
+ bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
// Returns true if \p I is an instruction that will be predicated either
// through scalar predication or masked load/store or masked gather/scatter.
+ // \p VF is the vectorization factor that will be used to vectorize \p I.
// Superset of instructions that return true for isScalarWithPredication.
- bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
+ bool isPredicatedInst(Instruction *I, ElementCount VF,
+ bool IsKnownUniform = false) {
// When we know the load is uniform and the original scalar loop was not
// predicated we don't need to mark it as a predicated instruction. Any
// vectorised blocks created when tail-folding are something artificial we
@@ -1589,7 +1593,7 @@ public:
// instructions.
if (isa<LoadInst>(I) || isa<StoreInst>(I))
return Legal->isMaskRequired(I);
- return isScalarWithPredication(I);
+ return isScalarWithPredication(I, VF);
}
/// Returns true if \p I is a memory instruction with consecutive memory
@@ -1781,7 +1785,7 @@ private:
/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
- bool useEmulatedMaskMemRefHack(Instruction *I);
+ bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
/// Map of scalar integer values to the smallest bitwidth they can be legally
/// represented as. The vector equivalents of these values should be truncated
@@ -4864,7 +4868,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
-bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
+bool LoopVectorizationCostModel::isScalarWithPredication(
+ Instruction *I, ElementCount VF) const {
if (!blockNeedsPredicationForAnyReason(I->getParent()))
return false;
switch(I->getOpcode()) {
@@ -4876,11 +4881,14 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
return false;
auto *Ptr = getLoadStorePointerOperand(I);
auto *Ty = getLoadStoreType(I);
+ Type *VTy = Ty;
+ if (VF.isVector())
+ VTy = VectorType::get(Ty, VF);
const Align Alignment = getLoadStoreAlignment(I);
return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
- TTI.isLegalMaskedGather(Ty, Alignment))
+ TTI.isLegalMaskedGather(VTy, Alignment))
: !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
- TTI.isLegalMaskedScatter(Ty, Alignment));
+ TTI.isLegalMaskedScatter(VTy, Alignment));
}
case Instruction::UDiv:
case Instruction::SDiv:
@@ -4953,7 +4961,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
// If the instruction is a store located in a predicated block, it will be
// scalarized.
- if (isScalarWithPredication(I))
+ if (isScalarWithPredication(I, VF))
return false;
// If the instruction's allocated size doesn't equal it's type size, it
@@ -5004,7 +5012,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
<< *I << "\n");
return;
}
- if (isScalarWithPredication(I)) {
+ if (isScalarWithPredication(I, VF)) {
LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
<< *I << "\n");
return;
@@ -6433,7 +6441,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
return RUs;
}
-bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
+ ElementCount VF) {
// TODO: Cost model for emulated masked load/store is completely
// broken. This hack guides the cost model to use an artificially
// high enough value to practically disable vectorization with such
@@ -6442,8 +6451,7 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
// from moving "masked load/store" check from legality to cost model.
// Masked Load/Gather emulation was previously never allowed.
// Limited number of Masked Store/Scatter emulation was allowed.
- assert(isPredicatedInst(I) &&
- "Expecting a scalar emulated instruction");
+ assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
return isa<LoadInst>(I) ||
(isa<StoreInst>(I) &&
NumPredStores > NumberOfStoresToPredicate);
@@ -6470,13 +6478,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
if (!blockNeedsPredicationForAnyReason(BB))
continue;
for (Instruction &I : *BB)
- if (isScalarWithPredication(&I)) {
+ if (isScalarWithPredication(&I, VF)) {
ScalarCostsTy ScalarCosts;
// Do not apply discount if scalable, because that would lead to
// invalid scalarization costs.
// Do not apply discount logic if hacked cost is needed
// for emulated masked memrefs.
- if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
+ if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
// Remember that BB will remain after vectorization.
@@ -6512,7 +6520,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// If the instruction is scalar with predication, it will be analyzed
// separately. We ignore it within the context of PredInst.
- if (isScalarWithPredication(I))
+ if (isScalarWithPredication(I, VF))
return false;
// If any of the instruction's operands are uniform after vectorization,
@@ -6559,7 +6567,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
- if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+ if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(I->getType(), VF)),
APInt::getAllOnes(VF.getFixedValue()), true, false);
@@ -6722,7 +6730,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// If we have a predicated load/store, it will need extra i1 extracts and
// conditional branches, but may not be executed for each vector lane. Scale
// the cost by the probability of executing the predicated block.
- if (isPredicatedInst(I)) {
+ if (isPredicatedInst(I, VF)) {
Cost /= getReciprocalPredBlockProb();
// Add the cost of an i1 extract and a branch
@@ -6733,7 +6741,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
/*Insert=*/false, /*Extract=*/true);
Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
- if (useEmulatedMaskMemRefHack(I))
+ if (useEmulatedMaskMemRefHack(I, VF))
// Artificially setting to a high enough value to practically disable
// vectorization with such operations.
Cost = 3000000;
@@ -7140,7 +7148,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// predicated uniform stores. Today they are treated as any other
// predicated store (see added test cases in
// invariant-store-vectorization.ll).
- if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
+ if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
NumPredStores++;
if (Legal->isUniformMemOp(I)) {
@@ -7150,7 +7158,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
InstructionCost Cost;
if (isa<StoreInst>(&I) && VF.isScalable() &&
- isLegalGatherOrScatter(&I)) {
+ isLegalGatherOrScatter(&I, VF)) {
Cost = getGatherScatterCost(&I, VF);
setWideningDecision(&I, VF, CM_GatherScatter, Cost);
} else {
@@ -7192,7 +7200,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
}
InstructionCost GatherScatterCost =
- isLegalGatherOrScatter(&I)
+ isLegalGatherOrScatter(&I, VF)
? getGatherScatterCost(&I, VF) * NumAccesses
: InstructionCost::getInvalid();
@@ -7395,7 +7403,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// vector lane. Get the scalarization cost and scale this amount by the
// probability of executing the predicated block. If the instruction is not
// predicated, we fall through to the next case.
- if (VF.isVector() && isScalarWithPredication(I)) {
+ if (VF.isVector() && isScalarWithPredication(I, VF)) {
InstructionCost Cost = 0;
// These instructions have a non-void type, so account for the phi nodes
@@ -8568,7 +8576,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
VFRange &Range) const {
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
+ [this, CI](ElementCount VF) {
+ return CM.isScalarWithPredication(CI, VF);
+ },
Range);
if (IsPredicated)
@@ -8608,7 +8618,8 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
// scalarization is profitable or it is predicated.
auto WillScalarize = [this, I](ElementCount VF) -> bool {
return CM.isScalarAfterVectorization(I, VF) ||
- CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
+ CM.isProfitableToScalarize(I, VF) ||
+ CM.isScalarWithPredication(I, VF);
};
return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
Range);
@@ -8682,7 +8693,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
Range);
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
+ [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
Range);
// Even if the instruction is not marked as uniform, there are certain
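
The recurring change in this file is that the TTI legality queries now see the type the vectorizer will actually emit, i.e. the load/store element type widened by the candidate VF, instead of only the scalar element type. A small illustrative helper (not LLVM code) showing which type the queries are asked about for a given VF:

#include <string>

// For VF == 1 the query type stays scalar; for a vector VF the legality
// question is asked about <VF x EltTy>, matching the new
// VectorType::get(Ty, VF) calls in isLegalGatherOrScatter and
// isScalarWithPredication.
std::string queryTypeForVF(const std::string &EltTy, unsigned VF) {
  return VF > 1 ? "<" + std::to_string(VF) + " x " + EltTy + ">" : EltTy;
}

// queryTypeForVF("i32", 4) == "<4 x i32>"; queryTypeForVF("i32", 1) == "i32".
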
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
index e7801a9..a80fb28 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
@@ -90,4 +90,4 @@ for.end: ; preds = %for.inc, %entry
ret void
}
-attributes #0 = { "target-features"="+neon,+sve,+v8.1a" }
+attributes #0 = { "target-features"="+neon,+sve,+v8.1a" vscale_range(2, 0) }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
index b9a48e3..c765699 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
@@ -125,7 +125,7 @@ for.inc: ; preds = %for.body, %if.then
br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
}
-attributes #0 = {"target-cpu"="generic" "target-features"="+neon,+sve"}
+attributes #0 = {"target-cpu"="generic" "target-features"="+neon,+sve" vscale_range(2,0) }
!0 = distinct !{!0, !1, !2, !3, !4, !5}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
index 213b439..c474551 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
@@ -2,8 +2,8 @@
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
@b = global [8 x i32] zeroinitializer, align 16
@@ -32,6 +32,12 @@ define void @foo() {
; AVX-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* bitcast ([8 x i32]* @a to <8 x i32>*), align 16
; AVX-NEXT: ret void
;
+; AVX512-LABEL: @foo(
+; AVX512-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> <i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2)>, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; AVX512-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; AVX512-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* bitcast ([8 x i32]* @a to <8 x i32>*), align 16
+; AVX512-NEXT: ret void
+;
%1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16
store i32 %1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 0), align 16
%2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2), align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
index b501ab1..f6dd752 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -105,19 +105,23 @@ define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture rea
; AVX512F-LABEL: @gather_load_2(
; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
-; AVX512F-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX512F-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
+; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
+; AVX512F-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
-; AVX512F-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
-; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
-; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
-; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; AVX512F-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
+; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
+; AVX512F-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX512F-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
+; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
+; AVX512F-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX512F-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
+; AVX512F-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_2(
@@ -255,35 +259,43 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea
;
; AVX512F-LABEL: @gather_load_3(
; AVX512F-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
-; AVX512F-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX512F-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1
+; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
+; AVX512F-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
-; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0
-; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i64 1
-; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i64 2
-; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
-; AVX512F-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4
-; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; AVX512F-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX512F-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX512F-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX512F-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX512F-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i64 0
-; AVX512F-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP20]], i64 1
-; AVX512F-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP22]], i64 2
-; AVX512F-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3
-; AVX512F-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; AVX512F-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2
+; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
+; AVX512F-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX512F-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3
+; AVX512F-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
+; AVX512F-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX512F-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4
+; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
+; AVX512F-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
+; AVX512F-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1
+; AVX512F-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
+; AVX512F-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX512F-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2
+; AVX512F-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
+; AVX512F-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX512F-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3
+; AVX512F-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
+; AVX512F-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX512F-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4
+; AVX512F-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_3(
@@ -457,13 +469,19 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re
; AVX2-NEXT: ret void
;
; AVX512F-LABEL: @gather_load_4(
+; AVX512F-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
; AVX512F-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
+; AVX512F-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
; AVX512F-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
+; AVX512F-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
; AVX512F-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
-; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4
+; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
; AVX512F-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
+; AVX512F-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
; AVX512F-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
+; AVX512F-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
; AVX512F-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
+; AVX512F-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
; AVX512F-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
; AVX512F-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
@@ -473,20 +491,22 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re
; AVX512F-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0
-; AVX512F-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1
-; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2
-; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3
-; AVX512F-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0
-; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T23]], i64 1
-; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T27]], i64 2
-; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T31]], i64 3
-; AVX512F-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT: [[TMP11:%.*]] = bitcast i32* [[T0]] to <4 x i32>*
-; AVX512F-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>*
-; AVX512F-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[T4:%.*]] = add i32 [[T3]], 1
+; AVX512F-NEXT: [[T8:%.*]] = add i32 [[T7]], 2
+; AVX512F-NEXT: [[T12:%.*]] = add i32 [[T11]], 3
+; AVX512F-NEXT: [[T16:%.*]] = add i32 [[T15]], 4
+; AVX512F-NEXT: [[T20:%.*]] = add i32 [[T19]], 1
+; AVX512F-NEXT: [[T24:%.*]] = add i32 [[T23]], 2
+; AVX512F-NEXT: [[T28:%.*]] = add i32 [[T27]], 3
+; AVX512F-NEXT: [[T32:%.*]] = add i32 [[T31]], 4
+; AVX512F-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: store i32 [[T8]], i32* [[T5]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: store i32 [[T12]], i32* [[T9]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: store i32 [[T16]], i32* [[T13]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_4(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
index 5c125a1..fd1c612 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -105,19 +105,23 @@ define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture rea
; AVX512F-LABEL: @gather_load_2(
; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
-; AVX512F-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX512F-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
+; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
+; AVX512F-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
-; AVX512F-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
-; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
-; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
-; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; AVX512F-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
+; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
+; AVX512F-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX512F-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
+; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
+; AVX512F-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX512F-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
+; AVX512F-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_2(
@@ -255,35 +259,43 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea
;
; AVX512F-LABEL: @gather_load_3(
; AVX512F-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
-; AVX512F-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX512F-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1
+; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
+; AVX512F-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
-; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0
-; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i64 1
-; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i64 2
-; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
-; AVX512F-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4
-; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; AVX512F-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX512F-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX512F-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX512F-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX512F-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i64 0
-; AVX512F-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP20]], i64 1
-; AVX512F-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP22]], i64 2
-; AVX512F-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3
-; AVX512F-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; AVX512F-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2
+; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
+; AVX512F-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX512F-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3
+; AVX512F-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
+; AVX512F-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX512F-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4
+; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
+; AVX512F-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
+; AVX512F-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1
+; AVX512F-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
+; AVX512F-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX512F-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2
+; AVX512F-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
+; AVX512F-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX512F-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3
+; AVX512F-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
+; AVX512F-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX512F-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4
+; AVX512F-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_3(
@@ -457,13 +469,19 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re
; AVX2-NEXT: ret void
;
; AVX512F-LABEL: @gather_load_4(
+; AVX512F-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
; AVX512F-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
+; AVX512F-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
; AVX512F-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
+; AVX512F-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
; AVX512F-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
-; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4
+; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
; AVX512F-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
+; AVX512F-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
; AVX512F-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
+; AVX512F-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
; AVX512F-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
+; AVX512F-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
; AVX512F-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
; AVX512F-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
@@ -473,20 +491,22 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re
; AVX512F-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0
-; AVX512F-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1
-; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2
-; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3
-; AVX512F-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0
-; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T23]], i64 1
-; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T27]], i64 2
-; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T31]], i64 3
-; AVX512F-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT: [[TMP11:%.*]] = bitcast i32* [[T0]] to <4 x i32>*
-; AVX512F-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>*
-; AVX512F-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[T4:%.*]] = add i32 [[T3]], 1
+; AVX512F-NEXT: [[T8:%.*]] = add i32 [[T7]], 2
+; AVX512F-NEXT: [[T12:%.*]] = add i32 [[T11]], 3
+; AVX512F-NEXT: [[T16:%.*]] = add i32 [[T15]], 4
+; AVX512F-NEXT: [[T20:%.*]] = add i32 [[T19]], 1
+; AVX512F-NEXT: [[T24:%.*]] = add i32 [[T23]], 2
+; AVX512F-NEXT: [[T28:%.*]] = add i32 [[T27]], 3
+; AVX512F-NEXT: [[T32:%.*]] = add i32 [[T31]], 4
+; AVX512F-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: store i32 [[T8]], i32* [[T5]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: store i32 [[T12]], i32* [[T9]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: store i32 [[T16]], i32* [[T13]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_4(