about summary refs log tree commit diff
diff options
context:
space:
mode:
author Alexey Bataev <a.bataev@outlook.com> 2024-03-07 12:44:25 -0800
committer Alexey Bataev <a.bataev@outlook.com> 2024-03-07 12:44:53 -0800
commit 11185715a28c6592ca6fe247fe693b305c85627a (patch)
tree 7d193f41b9dae8106782a1869a40ac838bec584c
parent 458636690afdd223ffa72f49164f30449b588892 (diff)
download llvm-11185715a28c6592ca6fe247fe693b305c85627a.zip
llvm-11185715a28c6592ca6fe247fe693b305c85627a.tar.gz
llvm-11185715a28c6592ca6fe247fe693b305c85627a.tar.bz2
Revert "[SLP]Improve minbitwidth analysis."
This reverts commit 4ce52e2d576937fe930294cae883a0daa17eeced to fix issues detected by https://lab.llvm.org/buildbot/#/builders/74/builds/26470/steps/12/logs/stdio.
-rw-r--r-- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp 634
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll 9
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll 4
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll 20
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll 7
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll 9
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll 2
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll 17
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll 21
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll 43
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll 24
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/resched.ll 32
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll 10
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll 22
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll 4
15 files changed, 305 insertions, 553 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1889bc0..36dc909 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1085,9 +1085,6 @@ public:
BS->clear();
}
MinBWs.clear();
- ReductionBitWidth = 0;
- CastMaxMinBWSizes.reset();
- TruncNodes.clear();
InstrElementSize.clear();
UserIgnoreList = nullptr;
PostponedGathers.clear();
@@ -2290,7 +2287,6 @@ public:
void clearReductionData() {
AnalyzedReductionsRoots.clear();
AnalyzedReductionVals.clear();
- AnalyzedMinBWVals.clear();
}
/// Checks if the given value is gathered in one of the nodes.
bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
@@ -2311,11 +2307,9 @@ private:
/// constant and to be demoted. Required to correctly identify constant nodes
/// to be demoted.
bool collectValuesToDemote(
- Value *V, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
- SmallVectorImpl<Value *> &ToDemote,
+ Value *V, SmallVectorImpl<Value *> &ToDemote,
DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
- DenseSet<Value *> &Visited, unsigned &MaxDepthLevel,
- bool &IsProfitableToDemote) const;
+ SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const;
/// Check if the operands on the edges \p Edges of the \p UserTE allows
/// reordering (i.e. the operands can be reordered because they have only one
@@ -2381,10 +2375,6 @@ private:
/// \ returns the graph entry for the \p Idx operand of the \p E entry.
const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
- /// \returns Cast context for the given graph node.
- TargetTransformInfo::CastContextHint
- getCastContextHint(const TreeEntry &TE) const;
-
/// \returns the cost of the vectorizable entry.
InstructionCost getEntryCost(const TreeEntry *E,
ArrayRef<Value *> VectorizedVals,
@@ -2935,18 +2925,11 @@ private:
}
assert(!BundleMember && "Bundle and VL out of sync");
} else {
+ MustGather.insert(VL.begin(), VL.end());
// Build a map for gathered scalars to the nodes where they are used.
- bool AllConstsOrCasts = true;
for (Value *V : VL)
- if (!isConstant(V)) {
- auto *I = dyn_cast<CastInst>(V);
- AllConstsOrCasts &= I && I->getType()->isIntegerTy();
+ if (!isConstant(V))
ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
- }
- if (AllConstsOrCasts)
- CastMaxMinBWSizes =
- std::make_pair(std::numeric_limits<unsigned>::max(), 1);
- MustGather.insert(VL.begin(), VL.end());
}
if (UserTreeIdx.UserTE)
@@ -3071,10 +3054,6 @@ private:
/// Set of hashes for the list of reduction values already being analyzed.
DenseSet<size_t> AnalyzedReductionVals;
- /// Values, already been analyzed for mininmal bitwidth and found to be
- /// non-profitable.
- DenseSet<Value *> AnalyzedMinBWVals;
-
/// A list of values that need to extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User). External User
/// can be nullptr, it means that this Internal Scalar will be used later,
@@ -3650,18 +3629,6 @@ private:
/// value must be signed-extended, rather than zero-extended, back to its
/// original width.
DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
-
- /// Final size of the reduced vector, if the current graph represents the
- /// input for the reduction and it was possible to narrow the size of the
- /// reduction.
- unsigned ReductionBitWidth = 0;
-
- /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
- /// type sizes, used in the tree.
- std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
-
- /// Indices of the vectorized trunc nodes.
- DenseSet<unsigned> TruncNodes;
};
} // end namespace slpvectorizer
@@ -6572,29 +6539,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
- auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
- std::make_pair(std::numeric_limits<unsigned>::min(),
- std::numeric_limits<unsigned>::max()));
- if (ShuffleOrOp == Instruction::ZExt ||
- ShuffleOrOp == Instruction::SExt) {
- CastMaxMinBWSizes = std::make_pair(
- std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
- PrevMaxBW),
- std::min<unsigned>(
- DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
- PrevMinBW));
- } else if (ShuffleOrOp == Instruction::Trunc) {
- CastMaxMinBWSizes = std::make_pair(
- std::max<unsigned>(
- DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
- PrevMaxBW),
- std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
- PrevMinBW));
- TruncNodes.insert(VectorizableTree.size());
- }
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
-
LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
TE->setOperandsInOrder();
@@ -8416,22 +8362,6 @@ const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
return It->get();
}
-TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
- if (TE.State == TreeEntry::ScatterVectorize ||
- TE.State == TreeEntry::StridedVectorize)
- return TTI::CastContextHint::GatherScatter;
- if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
- !TE.isAltShuffle()) {
- if (TE.ReorderIndices.empty())
- return TTI::CastContextHint::Normal;
- SmallVector<int> Mask;
- inversePermutation(TE.ReorderIndices, Mask);
- if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
- return TTI::CastContextHint::Reversed;
- }
- return TTI::CastContextHint::None;
-}
-
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallPtrSetImpl<Value *> &CheckedExtracts) {
@@ -8454,7 +8384,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// If we have computed a smaller type for the expression, update VecTy so
// that the costs will be accurate.
auto It = MinBWs.find(E);
- Type *OrigScalarTy = ScalarTy;
if (It != MinBWs.end()) {
ScalarTy = IntegerType::get(F->getContext(), It->second.first);
VecTy = FixedVectorType::get(ScalarTy, VL.size());
@@ -8512,11 +8441,24 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
UsedScalars.set(I);
}
auto GetCastContextHint = [&](Value *V) {
- if (const TreeEntry *OpTE = getTreeEntry(V))
- return getCastContextHint(*OpTE);
- InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
- if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
- return TTI::CastContextHint::GatherScatter;
+ if (const TreeEntry *OpTE = getTreeEntry(V)) {
+ if (OpTE->State == TreeEntry::ScatterVectorize ||
+ OpTE->State == TreeEntry::StridedVectorize)
+ return TTI::CastContextHint::GatherScatter;
+ if (OpTE->State == TreeEntry::Vectorize &&
+ OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) {
+ if (OpTE->ReorderIndices.empty())
+ return TTI::CastContextHint::Normal;
+ SmallVector<int> Mask;
+ inversePermutation(OpTE->ReorderIndices, Mask);
+ if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
+ return TTI::CastContextHint::Reversed;
+ }
+ } else {
+ InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
+ if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
+ return TTI::CastContextHint::GatherScatter;
+ }
return TTI::CastContextHint::None;
};
auto GetCostDiff =
@@ -8565,6 +8507,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
TTI::CastContextHint CCH = GetCastContextHint(VL0);
VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
CostKind);
+ ScalarCost += Sz * TTI->getCastInstrCost(VecOpcode, UserScalarTy,
+ ScalarTy, CCH, CostKind);
}
}
}
@@ -8581,7 +8525,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
InstructionCost ScalarCost = 0;
InstructionCost VecCost = 0;
std::tie(ScalarCost, VecCost) = getGEPCosts(
- *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
+ *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, ScalarTy, VecTy);
LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
"Calculated GEPs cost for Tree"));
@@ -8628,7 +8572,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
NumElts = ATy->getNumElements();
else
NumElts = AggregateTy->getStructNumElements();
- SrcVecTy = FixedVectorType::get(OrigScalarTy, NumElts);
+ SrcVecTy = FixedVectorType::get(ScalarTy, NumElts);
}
if (I->hasOneUse()) {
Instruction *Ext = I->user_back();
@@ -8796,7 +8740,13 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
}
auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
- auto *VI = cast<Instruction>(UniqueValues[Idx]);
+ // Do not count cost here if minimum bitwidth is in effect and it is just
+ // a bitcast (here it is just a noop).
+ if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
+ return TTI::TCC_Free;
+ auto *VI = VL0->getOpcode() == Opcode
+ ? cast<Instruction>(UniqueValues[Idx])
+ : nullptr;
return TTI->getCastInstrCost(Opcode, VL0->getType(),
VL0->getOperand(0)->getType(),
TTI::getCastContextHint(VI), CostKind, VI);
@@ -8839,7 +8789,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
? CmpInst::BAD_FCMP_PREDICATE
: CmpInst::BAD_ICMP_PREDICATE;
- return TTI->getCmpSelInstrCost(E->getOpcode(), OrigScalarTy,
+ return TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
Builder.getInt1Ty(), CurrentPred, CostKind,
VI);
};
@@ -8894,7 +8844,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
TTI::OperandValueInfo Op2Info =
TTI::getOperandInfo(VI->getOperand(OpIdx));
SmallVector<const Value *> Operands(VI->operand_values());
- return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
+ return TTI->getArithmeticInstrCost(ShuffleOrOp, ScalarTy, CostKind,
Op1Info, Op2Info, Operands, VI);
};
auto GetVectorCost = [=](InstructionCost CommonCost) {
@@ -8913,9 +8863,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
case Instruction::Load: {
auto GetScalarCost = [&](unsigned Idx) {
auto *VI = cast<LoadInst>(UniqueValues[Idx]);
- return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
- VI->getAlign(), VI->getPointerAddressSpace(),
- CostKind, TTI::OperandValueInfo(), VI);
+ return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
+ VI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo(), VI);
};
auto *LI0 = cast<LoadInst>(VL0);
auto GetVectorCost = [&](InstructionCost CommonCost) {
@@ -8958,9 +8908,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
auto GetScalarCost = [=](unsigned Idx) {
auto *VI = cast<StoreInst>(VL[Idx]);
TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
- return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
- VI->getAlign(), VI->getPointerAddressSpace(),
- CostKind, OpInfo, VI);
+ return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(),
+ VI->getPointerAddressSpace(), CostKind,
+ OpInfo, VI);
};
auto *BaseSI =
cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
@@ -9822,44 +9772,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
Cost -= InsertCost;
}
- // Add the cost for reduced value resize (if required).
- if (ReductionBitWidth != 0) {
- assert(UserIgnoreList && "Expected reduction tree.");
- const TreeEntry &E = *VectorizableTree.front().get();
- auto It = MinBWs.find(&E);
- if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
- unsigned SrcSize = It->second.first;
- unsigned DstSize = ReductionBitWidth;
- unsigned Opcode = Instruction::Trunc;
- if (SrcSize < DstSize)
- Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
- auto *SrcVecTy =
- FixedVectorType::get(Builder.getIntNTy(SrcSize), E.getVectorFactor());
- auto *DstVecTy =
- FixedVectorType::get(Builder.getIntNTy(DstSize), E.getVectorFactor());
- TTI::CastContextHint CCH = getCastContextHint(E);
- InstructionCost CastCost;
- switch (E.getOpcode()) {
- case Instruction::SExt:
- case Instruction::ZExt:
- case Instruction::Trunc: {
- const TreeEntry *OpTE = getOperandEntry(&E, 0);
- CCH = getCastContextHint(*OpTE);
- break;
- }
- default:
- break;
- }
- CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
- TTI::TCK_RecipThroughput);
- Cost += CastCost;
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
- << " for final resize for reduction from " << SrcVecTy
- << " to " << DstVecTy << "\n";
- dbgs() << "SLP: Current total cost = " << Cost << "\n");
- }
- }
-
#ifndef NDEBUG
SmallString<256> Str;
{
@@ -10080,30 +9992,6 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
// tree node for each gathered value - we have just a permutation of the
// single vector. If we have 2 different sets, we're in situation where we
// have a permutation of 2 input vectors.
- // Filter out entries with larger bitwidth of elements.
- Type *ScalarTy = VL.front()->getType();
- unsigned BitWidth = 0;
- if (ScalarTy->isIntegerTy()) {
- // Check if the used TEs supposed to be resized and choose the best
- // candidates.
- BitWidth = DL->getTypeStoreSize(ScalarTy);
- if (TEUseEI.UserTE->getOpcode() != Instruction::Select ||
- TEUseEI.EdgeIdx != 0) {
- auto UserIt = MinBWs.find(TEUseEI.UserTE);
- if (UserIt != MinBWs.end())
- BitWidth = UserIt->second.second;
- }
- }
- auto CheckBitwidth = [&](const TreeEntry &TE) {
- Type *ScalarTy = TE.Scalars.front()->getType();
- if (!ScalarTy->isIntegerTy())
- return true;
- unsigned TEBitWidth = DL->getTypeStoreSize(ScalarTy);
- auto UserIt = MinBWs.find(TEUseEI.UserTE);
- if (UserIt != MinBWs.end())
- TEBitWidth = UserIt->second.second;
- return BitWidth == TEBitWidth;
- };
SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
DenseMap<Value *, int> UsedValuesEntry;
for (Value *V : VL) {
@@ -10138,8 +10026,6 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
continue;
}
- if (!CheckBitwidth(*TEPtr))
- continue;
// Check if the user node of the TE comes after user node of TEPtr,
// otherwise TEPtr depends on TE.
if ((TEInsertBlock != InsertPt->getParent() ||
@@ -10156,8 +10042,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
continue;
VTE = *It->getSecond().begin();
// Iterate through all vectorized nodes.
- auto *MIt = find_if(It->getSecond(), [&](const TreeEntry *MTE) {
- return MTE->State == TreeEntry::Vectorize && CheckBitwidth(*MTE);
+ auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
+ return MTE->State == TreeEntry::Vectorize;
});
if (MIt == It->getSecond().end())
continue;
@@ -10167,7 +10053,10 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
continue;
- if (!CheckBitwidth(*VTE))
+ auto It = MinBWs.find(VTE);
+ // If vectorize node is demoted - do not match.
+ if (It != MinBWs.end() &&
+ It->second.first != DL->getTypeSizeInBits(V->getType()))
continue;
VToTEs.insert(VTE);
}
@@ -13040,21 +12929,7 @@ Value *BoUpSLP::vectorizeTree(
Builder.ClearInsertionPoint();
InstrElementSize.clear();
- const TreeEntry &RootTE = *VectorizableTree.front().get();
- Value *Vec = RootTE.VectorizedValue;
- if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
- It != MinBWs.end() &&
- ReductionBitWidth != It->second.first) {
- IRBuilder<>::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(ReductionRoot->getParent(),
- ReductionRoot->getIterator());
- Vec = Builder.CreateIntCast(
- Vec,
- VectorType::get(Builder.getIntNTy(ReductionBitWidth),
- cast<VectorType>(Vec->getType())->getElementCount()),
- It->second.second);
- }
- return Vec;
+ return VectorizableTree[0]->VectorizedValue;
}
void BoUpSLP::optimizeGatherSequence() {
@@ -13874,42 +13749,23 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
// smaller type with a truncation. We collect the values that will be demoted
// in ToDemote and additional roots that require investigating in Roots.
bool BoUpSLP::collectValuesToDemote(
- Value *V, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
- SmallVectorImpl<Value *> &ToDemote,
+ Value *V, SmallVectorImpl<Value *> &ToDemote,
DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
- DenseSet<Value *> &Visited, unsigned &MaxDepthLevel,
- bool &IsProfitableToDemote) const {
+ SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const {
// We can always demote constants.
- if (isa<Constant>(V)) {
- MaxDepthLevel = 1;
+ if (isa<Constant>(V))
return true;
- }
// If the value is not a vectorized instruction in the expression and not used
// by the insertelement instruction and not used in multiple vector nodes, it
// cannot be demoted.
- // TODO: improve handling of gathered values and others.
auto *I = dyn_cast<Instruction>(V);
- if (!I || !Visited.insert(I).second || !getTreeEntry(I) ||
- MultiNodeScalars.contains(I) || all_of(I->users(), [&](User *U) {
+ if (!I || !getTreeEntry(I) || MultiNodeScalars.contains(I) ||
+ !Visited.insert(I).second || all_of(I->users(), [&](User *U) {
return isa<InsertElementInst>(U) && !getTreeEntry(U);
}))
return false;
- auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
- if (MultiNodeScalars.contains(V))
- return false;
- uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
- APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
- if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
- return true;
- auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
- unsigned BitWidth1 = OrigBitWidth - NumSignBits;
- if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
- ++BitWidth1;
- BitWidth = std::max(BitWidth, BitWidth1);
- return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
- };
unsigned Start = 0;
unsigned End = I->getNumOperands();
switch (I->getOpcode()) {
@@ -13917,14 +13773,12 @@ bool BoUpSLP::collectValuesToDemote(
// We can always demote truncations and extensions. Since truncations can
// seed additional demotion, we save the truncated value.
case Instruction::Trunc:
- MaxDepthLevel = 1;
- if (IsProfitableToDemoteRoot)
- IsProfitableToDemote = true;
+ Roots.push_back(I->getOperand(0));
break;
case Instruction::ZExt:
case Instruction::SExt:
- MaxDepthLevel = 1;
- IsProfitableToDemote = true;
+ if (isa<ExtractElementInst, InsertElementInst>(I->getOperand(0)))
+ return false;
break;
// We can demote certain binary operations if we can demote both of their
@@ -13934,32 +13788,23 @@ bool BoUpSLP::collectValuesToDemote(
case Instruction::Mul:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor: {
- unsigned Level1, Level2;
- if (!collectValuesToDemote(I->getOperand(0), IsProfitableToDemoteRoot,
- BitWidth, ToDemote, DemotedConsts, Visited,
- Level1, IsProfitableToDemote) ||
- !collectValuesToDemote(I->getOperand(1), IsProfitableToDemoteRoot,
- BitWidth, ToDemote, DemotedConsts, Visited,
- Level2, IsProfitableToDemote))
+ case Instruction::Xor:
+ if (!collectValuesToDemote(I->getOperand(0), ToDemote, DemotedConsts, Roots,
+ Visited) ||
+ !collectValuesToDemote(I->getOperand(1), ToDemote, DemotedConsts, Roots,
+ Visited))
return false;
- MaxDepthLevel = std::max(Level1, Level2);
break;
- }
// We can demote selects if we can demote their true and false values.
case Instruction::Select: {
Start = 1;
- unsigned Level1, Level2;
SelectInst *SI = cast<SelectInst>(I);
- if (!collectValuesToDemote(SI->getTrueValue(), IsProfitableToDemoteRoot,
- BitWidth, ToDemote, DemotedConsts, Visited,
- Level1, IsProfitableToDemote) ||
- !collectValuesToDemote(SI->getFalseValue(), IsProfitableToDemoteRoot,
- BitWidth, ToDemote, DemotedConsts, Visited,
- Level2, IsProfitableToDemote))
+ if (!collectValuesToDemote(SI->getTrueValue(), ToDemote, DemotedConsts,
+ Roots, Visited) ||
+ !collectValuesToDemote(SI->getFalseValue(), ToDemote, DemotedConsts,
+ Roots, Visited))
return false;
- MaxDepthLevel = std::max(Level1, Level2);
break;
}
@@ -13968,262 +13813,171 @@ bool BoUpSLP::collectValuesToDemote(
case Instruction::PHI: {
PHINode *PN = cast<PHINode>(I);
for (Value *IncValue : PN->incoming_values())
- if (!collectValuesToDemote(IncValue, IsProfitableToDemoteRoot, BitWidth,
- ToDemote, DemotedConsts, Visited,
- MaxDepthLevel, IsProfitableToDemote))
+ if (!collectValuesToDemote(IncValue, ToDemote, DemotedConsts, Roots,
+ Visited))
return false;
break;
}
// Otherwise, conservatively give up.
default:
- return IsProfitableToDemote && IsPotentiallyTruncated(I, BitWidth);
+ return false;
}
- ++MaxDepthLevel;
// Gather demoted constant operands.
for (unsigned Idx : seq<unsigned>(Start, End))
if (isa<Constant>(I->getOperand(Idx)))
DemotedConsts.try_emplace(I).first->getSecond().push_back(Idx);
// Record the value that we can demote.
ToDemote.push_back(V);
- return IsProfitableToDemote;
+ return true;
}
void BoUpSLP::computeMinimumValueSizes() {
// We only attempt to truncate integer expressions.
- bool IsStoreOrInsertElt =
- VectorizableTree.front()->getOpcode() == Instruction::Store ||
- VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
- if ((IsStoreOrInsertElt || UserIgnoreList) && TruncNodes.size() <= 1 &&
- (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
- CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
+ auto &TreeRoot = VectorizableTree[0]->Scalars;
+ auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
+ if (!TreeRootIT || VectorizableTree.front()->State == TreeEntry::NeedToGather)
return;
- unsigned NodeIdx = 0;
- if (IsStoreOrInsertElt &&
- VectorizableTree.front()->State != TreeEntry::NeedToGather)
- NodeIdx = 1;
-
// Ensure the roots of the vectorizable tree don't form a cycle.
- if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
- (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
- (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
- [NodeIdx](const EdgeInfo &EI) {
- return EI.UserTE->Idx >
- static_cast<int>(NodeIdx);
- })))
- return;
-
- // The first value node for store/insertelement is sext/zext/trunc? Skip it,
- // resize to the final type.
- bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
- if (NodeIdx != 0 &&
- VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
- (VectorizableTree[NodeIdx]->getOpcode() == Instruction::ZExt ||
- VectorizableTree[NodeIdx]->getOpcode() == Instruction::SExt ||
- VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc)) {
- assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
- ++NodeIdx;
- IsProfitableToDemoteRoot = true;
- }
-
- // Analyzed in reduction already and not profitable - exit.
- if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
+ if (!VectorizableTree.front()->UserTreeIndices.empty())
return;
- SmallVector<Value *> ToDemote;
+ // Conservatively determine if we can actually truncate the roots of the
+ // expression. Collect the values that can be demoted in ToDemote and
+ // additional roots that require investigating in Roots.
+ SmallVector<Value *, 32> ToDemote;
DenseMap<Instruction *, SmallVector<unsigned>> DemotedConsts;
- auto ComputeMaxBitWidth = [&](ArrayRef<Value *> TreeRoot, unsigned VF,
- bool IsTopRoot, bool IsProfitableToDemoteRoot,
- unsigned Opcode, unsigned Limit) {
- ToDemote.clear();
- auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
- if (!TreeRootIT || !Opcode)
- return 0u;
-
- if (AnalyzedMinBWVals.contains(TreeRoot.front()))
- return 0u;
-
- unsigned NumParts = TTI->getNumberOfParts(
- FixedVectorType::get(TreeRoot.front()->getType(), VF));
-
- // The maximum bit width required to represent all the values that can be
- // demoted without loss of precision. It would be safe to truncate the roots
- // of the expression to this width.
- unsigned MaxBitWidth = 1u;
-
- // True if the roots can be zero-extended back to their original type,
- // rather than sign-extended. We know that if the leading bits are not
- // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
- // True.
+ SmallVector<Value *, 4> Roots;
+ for (auto *Root : TreeRoot) {
+ DenseSet<Value *> Visited;
+ if (!collectValuesToDemote(Root, ToDemote, DemotedConsts, Roots, Visited))
+ return;
+ }
+
+ // The maximum bit width required to represent all the values that can be
+ // demoted without loss of precision. It would be safe to truncate the roots
+ // of the expression to this width.
+ auto MaxBitWidth = 1u;
+
+ // We first check if all the bits of the roots are demanded. If they're not,
+ // we can truncate the roots to this narrower type.
+ for (auto *Root : TreeRoot) {
+ auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
+ MaxBitWidth = std::max<unsigned>(Mask.getBitWidth() - Mask.countl_zero(),
+ MaxBitWidth);
+ }
+
+ // True if the roots can be zero-extended back to their original type, rather
+ // than sign-extended. We know that if the leading bits are not demanded, we
+ // can safely zero-extend. So we initialize IsKnownPositive to True.
+ bool IsKnownPositive = true;
+
+ // If all the bits of the roots are demanded, we can try a little harder to
+ // compute a narrower type. This can happen, for example, if the roots are
+ // getelementptr indices. InstCombine promotes these indices to the pointer
+ // width. Thus, all their bits are technically demanded even though the
+ // address computation might be vectorized in a smaller type.
+ //
+ // We start by looking at each entry that can be demoted. We compute the
+ // maximum bit width required to store the scalar by using ValueTracking to
+ // compute the number of high-order bits we can truncate.
+ if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
+ all_of(TreeRoot, [](Value *V) {
+ return all_of(V->users(),
+ [](User *U) { return isa<GetElementPtrInst>(U); });
+ })) {
+ MaxBitWidth = 8u;
+
// Determine if the sign bit of all the roots is known to be zero. If not,
// IsKnownPositive is set to False.
- bool IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
+ IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
KnownBits Known = computeKnownBits(R, *DL);
return Known.isNonNegative();
});
- // We first check if all the bits of the roots are demanded. If they're not,
- // we can truncate the roots to this narrower type.
- for (auto *Root : TreeRoot) {
- unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
- TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
- unsigned BitWidth1 = NumTypeBits - NumSignBits;
- // If we can't prove that the sign bit is zero, we must add one to the
- // maximum bit width to account for the unknown sign bit. This preserves
- // the existing sign bit so we can safely sign-extend the root back to the
- // original type. Otherwise, if we know the sign bit is zero, we will
- // zero-extend the root instead.
- //
- // FIXME: This is somewhat suboptimal, as there will be cases where adding
- // one to the maximum bit width will yield a larger-than-necessary
- // type. In general, we need to add an extra bit only if we can't
- // prove that the upper bit of the original type is equal to the
- // upper bit of the proposed smaller type. If these two bits are
- // the same (either zero or one) we know that sign-extending from
- // the smaller type will result in the same value. Here, since we
- // can't yet prove this, we are just making the proposed smaller
- // type larger to ensure correctness.
- if (!IsKnownPositive)
- ++BitWidth1;
-
- APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
- unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
- MaxBitWidth =
- std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
- }
-
- if (MaxBitWidth < 8 && MaxBitWidth > 1)
- MaxBitWidth = 8;
-
- // If the original type is large, but reduced type does not improve the reg
- // use - ignore it.
- if (NumParts > 1 &&
- NumParts ==
- TTI->getNumberOfParts(FixedVectorType::get(
- IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
- return 0u;
-
- bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
- Opcode == Instruction::SExt ||
- Opcode == Instruction::ZExt || NumParts > 1;
- // Conservatively determine if we can actually truncate the roots of the
- // expression. Collect the values that can be demoted in ToDemote and
- // additional roots that require investigating in Roots.
- for (auto *Root : TreeRoot) {
- DenseSet<Value *> Visited;
- unsigned MaxDepthLevel;
- bool NeedToDemote = IsProfitableToDemote;
-
- if (!collectValuesToDemote(Root, IsProfitableToDemoteRoot, MaxBitWidth,
- ToDemote, DemotedConsts, Visited,
- MaxDepthLevel, NeedToDemote) ||
- (MaxDepthLevel <= Limit &&
- !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
- (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
- DL->getTypeSizeInBits(Root->getType()) /
- DL->getTypeSizeInBits(
- cast<Instruction>(Root)->getOperand(0)->getType()) >
- 2)) ||
- (Opcode == Instruction::Trunc &&
- (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
- DL->getTypeSizeInBits(
- cast<Instruction>(Root)->getOperand(0)->getType()) /
- DL->getTypeSizeInBits(Root->getType()) >
- 2)))))
- return 0u;
- }
- // Round MaxBitWidth up to the next power-of-two.
- MaxBitWidth = bit_ceil(MaxBitWidth);
-
- return MaxBitWidth;
- };
+ // Determine the maximum number of bits required to store the scalar
+ // values.
+ for (auto *Scalar : ToDemote) {
+ auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
+ auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
+ MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
+ }
+
+ // If we can't prove that the sign bit is zero, we must add one to the
+ // maximum bit width to account for the unknown sign bit. This preserves
+ // the existing sign bit so we can safely sign-extend the root back to the
+ // original type. Otherwise, if we know the sign bit is zero, we will
+ // zero-extend the root instead.
+ //
+ // FIXME: This is somewhat suboptimal, as there will be cases where adding
+ // one to the maximum bit width will yield a larger-than-necessary
+ // type. In general, we need to add an extra bit only if we can't
+ // prove that the upper bit of the original type is equal to the
+ // upper bit of the proposed smaller type. If these two bits are the
+ // same (either zero or one) we know that sign-extending from the
+ // smaller type will result in the same value. Here, since we can't
+ // yet prove this, we are just making the proposed smaller type
+ // larger to ensure correctness.
+ if (!IsKnownPositive)
+ ++MaxBitWidth;
+ }
+
+ // Round MaxBitWidth up to the next power-of-two.
+ MaxBitWidth = llvm::bit_ceil(MaxBitWidth);
+
+ // If the maximum bit width we compute is less than the with of the roots'
+ // type, we can proceed with the narrowing. Otherwise, do nothing.
+ if (MaxBitWidth >= TreeRootIT->getBitWidth())
+ return;
// If we can truncate the root, we must collect additional values that might
// be demoted as a result. That is, those seeded by truncations we will
// modify.
- // Add reduction ops sizes, if any.
- if (UserIgnoreList &&
- isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
- for (Value *V : *UserIgnoreList) {
- auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
- auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
- unsigned BitWidth1 = NumTypeBits - NumSignBits;
- if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
- ++BitWidth1;
- auto Mask = DB->getDemandedBits(cast<Instruction>(V));
- unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
- ReductionBitWidth =
- std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
- }
- if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
- ReductionBitWidth = 8;
-
- ReductionBitWidth = bit_ceil(ReductionBitWidth);
- }
- bool IsTopRoot = NodeIdx == 0;
- while (NodeIdx < VectorizableTree.size() &&
- VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
- VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc)
- ++NodeIdx;
- while (NodeIdx < VectorizableTree.size()) {
- ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
- unsigned Limit = 2;
- unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
- if (IsTopRoot &&
- ReductionBitWidth ==
- DL->getTypeSizeInBits(
- VectorizableTree.front()->Scalars.front()->getType()))
- Limit = 3;
- unsigned MaxBitWidth = ComputeMaxBitWidth(
- TreeRoot, VectorizableTree[NodeIdx]->getVectorFactor(), IsTopRoot,
- IsProfitableToDemoteRoot, Opcode, Limit);
- IsTopRoot = false;
- IsProfitableToDemoteRoot = true;
-
- if (TruncNodes.empty()) {
- NodeIdx = VectorizableTree.size();
- } else {
- NodeIdx = *TruncNodes.begin() + 1;
- TruncNodes.erase(TruncNodes.begin());
- }
-
- // If the maximum bit width we compute is less than the with of the roots'
- // type, we can proceed with the narrowing. Otherwise, do nothing.
- if (MaxBitWidth == 0 ||
- MaxBitWidth >=
- cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
- if (UserIgnoreList)
- AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
+ while (!Roots.empty()) {
+ DenseSet<Value *> Visited;
+ collectValuesToDemote(Roots.pop_back_val(), ToDemote, DemotedConsts, Roots,
+ Visited);
+ }
+
+ // Check that all users are marked for demotion.
+ DenseSet<Value *> Demoted(ToDemote.begin(), ToDemote.end());
+ DenseSet<const TreeEntry *> Visited;
+ for (Value *V: ToDemote) {
+ const TreeEntry *TE = getTreeEntry(V);
+ assert(TE && "Expected vectorized scalar.");
+ if (!Visited.insert(TE).second)
continue;
- }
-
- // Finally, map the values we can demote to the maximum bit with we
- // computed.
- for (Value *Scalar : ToDemote) {
- TreeEntry *TE = getTreeEntry(Scalar);
- assert(TE && "Expected vectorized scalar.");
- if (MinBWs.contains(TE))
- continue;
- bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
- return !isKnownNonNegative(R, SimplifyQuery(*DL));
- });
- MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
- const auto *I = cast<Instruction>(Scalar);
- auto DCIt = DemotedConsts.find(I);
- if (DCIt != DemotedConsts.end()) {
- for (unsigned Idx : DCIt->getSecond()) {
- // Check that all instructions operands are demoted.
+ if (!all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
+ return all_of(EI.UserTE->Scalars,
+ [&](Value *V) { return Demoted.contains(V); });
+ }))
+ return;
+ }
+ // Finally, map the values we can demote to the maximum bit width we computed.
+ for (auto *Scalar : ToDemote) {
+ auto *TE = getTreeEntry(Scalar);
+ assert(TE && "Expected vectorized scalar.");
+ if (MinBWs.contains(TE))
+ continue;
+ bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
+ KnownBits Known = computeKnownBits(R, *DL);
+ return !Known.isNonNegative();
+ });
+ MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
+ const auto *I = cast<Instruction>(Scalar);
+ auto DCIt = DemotedConsts.find(I);
+ if (DCIt != DemotedConsts.end()) {
+ for (unsigned Idx : DCIt->getSecond()) {
+ // Check that all instructions operands are demoted.
+ if (all_of(TE->Scalars, [&](Value *V) {
+ auto SIt = DemotedConsts.find(cast<Instruction>(V));
+ return SIt != DemotedConsts.end() &&
+ is_contained(SIt->getSecond(), Idx);
+ })) {
const TreeEntry *CTE = getOperandEntry(TE, Idx);
- if (all_of(TE->Scalars,
- [&](Value *V) {
- auto SIt = DemotedConsts.find(cast<Instruction>(V));
- return SIt != DemotedConsts.end() &&
- is_contained(SIt->getSecond(), Idx);
- }) ||
- all_of(CTE->Scalars, Constant::classof))
- MinBWs.try_emplace(CTE, MaxBitWidth, IsSigned);
+ MinBWs.try_emplace(CTE, MaxBitWidth, IsSigned);
}
}
}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
index 5e3fd15..cef7916 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -17,13 +17,12 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, ptr %p) {
; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]]
; CHECK-NEXT: [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[SUB0]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[TMP0]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]]
; CHECK-NEXT: [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP3]]
; CHECK-NEXT: [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
index 1cce520..47485e5 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
;test_i16_extend NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
; RUN: cat %t | FileCheck -check-prefix=YAML %s
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
; RUN: cat %t | FileCheck -check-prefix=YAML %s
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll
index a7a7f64..d67fdc1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll
@@ -28,11 +28,21 @@ entry:
define i64 @red_zext_ld_4xi64(ptr %ptr) {
; CHECK-LABEL: @red_zext_ld_4xi64(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16>
-; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i64
-; CHECK-NEXT: ret i64 [[TMP3]]
+; CHECK-NEXT: [[LD0:%.*]] = load i8, ptr [[PTR:%.*]], align 1
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[LD0]] to i64
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1
+; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i8 [[LD1]] to i64
+; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i64 [[ZEXT]], [[ZEXT_1]]
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 2
+; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[GEP_1]], align 1
+; CHECK-NEXT: [[ZEXT_2:%.*]] = zext i8 [[LD2]] to i64
+; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[ADD_1]], [[ZEXT_2]]
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 3
+; CHECK-NEXT: [[LD3:%.*]] = load i8, ptr [[GEP_2]], align 1
+; CHECK-NEXT: [[ZEXT_3:%.*]] = zext i8 [[LD3]] to i64
+; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i64 [[ADD_2]], [[ZEXT_3]]
+; CHECK-NEXT: ret i64 [[ADD_3]]
;
entry:
%ld0 = load i8, ptr %ptr
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
index 500f106..000e7a5 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
@@ -802,10 +802,9 @@ define i64 @red_zext_ld_4xi64(ptr %ptr) {
; CHECK-LABEL: @red_zext_ld_4xi64(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16>
-; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i64
-; CHECK-NEXT: ret i64 [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
+; CHECK-NEXT: ret i64 [[TMP2]]
;
entry:
%ld0 = load i8, ptr %ptr
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
index 05511f8..4565d49 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
@@ -15,12 +15,11 @@ define { i64, i64 } @patatino(double %arg) {
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([6 x double], ptr @global, i64 0, i64 4), align 16
; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
; CHECK-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
; CHECK-NEXT: [[T16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
-; CHECK-NEXT: [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP12]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
+; CHECK-NEXT: [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP11]], 1
; CHECK-NEXT: ret { i64, i64 } [[T17]]
;
bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll
index 5ee8016..a0af8e3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-6 < %s | FileCheck %s
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-3 < %s | FileCheck %s
define void @t(i64 %v) {
; CHECK-LABEL: define void @t(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll
index 6051638..6e512fc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll
@@ -6,17 +6,18 @@ define void @test(i8 %0) {
; CHECK-SAME: i8 [[TMP0:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> <i8 0, i8 poison>, i8 [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i8> [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = zext i8 [[TMP6]] to i32
-; CHECK-NEXT: [[ADD:%.*]] = or i32 [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i16>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = zext i16 [[TMP5]] to i32
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = zext i16 [[TMP7]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = or i32 [[TMP6]], [[TMP8]]
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD]], 1
; CHECK-NEXT: [[CONV9:%.*]] = trunc i32 [[SHR]] to i8
; CHECK-NEXT: store i8 [[CONV9]], ptr null, align 1
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll
index 4acd630..2c83461 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll
@@ -6,20 +6,15 @@ define void @test(i64 %d.promoted.i) {
; CHECK-SAME: i64 [[D_PROMOTED_I:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[AND_1_I:%.*]] = and i64 0, [[D_PROMOTED_I]]
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i64> <i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[AND_1_I]], i32 1
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i64> [[TMP0]] to <8 x i1>
-; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i1> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[AND_1_I_1:%.*]] = and i64 0, 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i64> <i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[AND_1_I_1]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = mul <8 x i1> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP5]])
-; CHECK-NEXT: [[TMP7:%.*]] = zext i1 [[TMP6]] to i32
-; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP2]])
-; CHECK-NEXT: [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
-; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP7]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[OP_RDX]], 0
-; CHECK-NEXT: store i32 [[TMP10]], ptr null, align 4
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i64> <i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[AND_1_I_1]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i64> [[TMP0]], i64 [[AND_1_I]], i32 9
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[TMP1]] to <16 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = mul <16 x i1> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = zext i1 [[TMP4]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 0
+; CHECK-NEXT: store i32 [[TMP6]], ptr null, align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
index a316415..651631d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
@@ -17,15 +17,12 @@ target triple = "x86_64-unknown-linux-gnu"
define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
; SSE-LABEL: @PR31243_zext(
; SSE-NEXT: entry:
-; SSE-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
-; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
-; SSE-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
-; SSE-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
-; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
-; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
-; SSE-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
-; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
+; SSE-NEXT: [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
+; SSE-NEXT: [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
+; SSE-NEXT: [[TMP2:%.*]] = zext i8 [[TMP0]] to i64
+; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
+; SSE-NEXT: [[TMP3:%.*]] = zext i8 [[TMP1]] to i64
+; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
; SSE-NEXT: [[T6:%.*]] = load i8, ptr [[T4]], align 1
; SSE-NEXT: [[T7:%.*]] = load i8, ptr [[T5]], align 1
; SSE-NEXT: [[T8:%.*]] = add i8 [[T6]], [[T7]]
@@ -76,15 +73,12 @@ entry:
define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
; SSE-LABEL: @PR31243_sext(
; SSE-NEXT: entry:
-; SSE-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
-; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
-; SSE-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
-; SSE-NEXT: [[TMP4:%.*]] = sext i8 [[TMP3]] to i64
-; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
-; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
-; SSE-NEXT: [[TMP6:%.*]] = sext i8 [[TMP5]] to i64
-; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
+; SSE-NEXT: [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
+; SSE-NEXT: [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
+; SSE-NEXT: [[TMP2:%.*]] = sext i8 [[TMP0]] to i64
+; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
+; SSE-NEXT: [[TMP3:%.*]] = sext i8 [[TMP1]] to i64
+; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
; SSE-NEXT: [[T6:%.*]] = load i8, ptr [[T4]], align 1
; SSE-NEXT: [[T7:%.*]] = load i8, ptr [[T5]], align 1
; SSE-NEXT: [[T8:%.*]] = add i8 [[T6]], [[T7]]
@@ -95,12 +89,13 @@ define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
; AVX-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
; AVX-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
-; AVX-NEXT: [[TMP4:%.*]] = sext i8 [[TMP3]] to i64
-; AVX-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
-; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
-; AVX-NEXT: [[TMP6:%.*]] = sext i8 [[TMP5]] to i64
-; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
+; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i64 0
+; AVX-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
+; AVX-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP5]]
+; AVX-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i64 1
+; AVX-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
+; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP7]]
; AVX-NEXT: [[T6:%.*]] = load i8, ptr [[T4]], align 1
; AVX-NEXT: [[T7:%.*]] = load i8, ptr [[T5]], align 1
; AVX-NEXT: [[T8:%.*]] = add i8 [[T6]], [[T7]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll
index 3cc32c1..88f75c3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll
@@ -15,8 +15,8 @@ define i32 @phi3UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 undef, i8 undef, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
-; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: ret i32 [[TMP6]]
;
entry:
@@ -52,8 +52,8 @@ define i32 @phi2UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 undef, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
-; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: ret i32 [[TMP6]]
;
entry:
@@ -89,8 +89,8 @@ define i32 @phi1UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 0, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
-; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: ret i32 [[TMP6]]
;
entry:
@@ -127,8 +127,8 @@ define i32 @phi1Undef1PoisonInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %ar
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 poison, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
-; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: ret i32 [[TMP6]]
;
entry:
@@ -165,8 +165,8 @@ define i32 @phi1Undef2PoisonInputs(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %a
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 poison, i8 poison, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
-; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: ret i32 [[TMP6]]
;
entry:
@@ -202,8 +202,8 @@ define i32 @phi1Undef1PoisonGapInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 poison, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
-; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: ret i32 [[TMP6]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
index b7237cb..78c6d95 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
@@ -11,26 +11,26 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv()
; CHECK: if.then22.i:
; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1
; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 2, i32 3, i32 4>
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0
+; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[SHUFFLE1]], <i32 1, i32 2, i32 3, i32 4>
; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5
; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6
; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP4]], <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_4_I_I]], i32 5
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_5_I_I]], i32 6
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_6_I_I]], i32 7
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: [[TMP14:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8>
-; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i8> [[TMP14]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT: store <16 x i8> [[TMP15]], ptr undef, align 1
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = lshr <8 x i32> [[SHUFFLE]], <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SHR_4_I_I]], i32 5
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_5_I_I]], i32 6
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_6_I_I]], i32 7
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT: [[TMP13:%.*]] = trunc <16 x i32> [[TMP12]] to <16 x i8>
+; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i8> [[TMP13]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: store <16 x i8> [[TMP14]], ptr undef, align 1
; CHECK-NEXT: unreachable
; CHECK: if.end50.i:
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll
index 1d1fcec..5d22b5a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll
@@ -7,10 +7,12 @@ define i1 @test(i1 %cmp5.not.31) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i1> <i1 poison, i1 false, i1 false, i1 false>, i1 [[CMP5_NOT_31]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i32> [[TMP1]], <i32 2, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
-; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 0
-; CHECK-NEXT: [[CMP_NOT_I_I:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[TMP3]], <i32 2, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 0
+; CHECK-NEXT: [[CMP_NOT_I_I:%.*]] = icmp eq i32 [[TMP6]], 0
; CHECK-NEXT: ret i1 [[CMP_NOT_I_I]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll
index 2f6868d..c1dd90d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll
@@ -8,18 +8,17 @@
; YAML-NEXT: Function: stores
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
-; YAML-NEXT: - Cost: '-7'
+; YAML-NEXT: - Cost: '-3'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '6'
define void @stores(ptr noalias %in, ptr noalias %inn, ptr noalias %out) {
; CHECK-LABEL: @stores(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[IN:%.*]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[INN:%.*]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i16>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i16> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i64>
-; CHECK-NEXT: store <4 x i64> [[TMP6]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: store <4 x i64> [[TMP5]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
%load.1 = load i8, ptr %in, align 1
@@ -64,18 +63,17 @@ define void @stores(ptr noalias %in, ptr noalias %inn, ptr noalias %out) {
; YAML-NEXT: Function: insertelems
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'SLP vectorized with cost '
-; YAML-NEXT: - Cost: '-9'
+; YAML-NEXT: - Cost: '-5'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '6'
define <4 x i64> @insertelems(ptr noalias %in, ptr noalias %inn) {
; CHECK-LABEL: @insertelems(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[IN:%.*]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[INN:%.*]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i16>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i16> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i64>
-; CHECK-NEXT: ret <4 x i64> [[TMP6]]
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: ret <4 x i64> [[TMP5]]
;
%load.1 = load i8, ptr %in, align 1
%gep.1 = getelementptr inbounds i8, ptr %in, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll b/llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll
index ff6f0bdd..061fbdb 100644
--- a/llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll
@@ -10,8 +10,8 @@ define i32 @alt_cmp(i16 %call46) {
; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x i16> [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i16> [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
-; CHECK-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i16
+; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i1> [[TMP4]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP5]])
; CHECK-NEXT: [[OP_RDX:%.*]] = or i16 [[TMP6]], 0
; CHECK-NEXT: [[EXT:%.*]] = zext i16 [[OP_RDX]] to i32
; CHECK-NEXT: ret i32 [[EXT]]