aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms/Vectorize
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp446
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h3
2 files changed, 360 insertions, 89 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4e33474..de4e56f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2422,18 +2422,25 @@ private:
/// \param TE Tree entry checked for permutation.
/// \param VL List of scalars (a subset of the TE scalar), checked for
/// permutations. Must form single-register vector.
+ /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
+ /// commands to build the mask using the original vector value, without
+ /// relying on the potential reordering.
/// \returns ShuffleKind, if gathered values can be represented as shuffles of
/// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
std::optional<TargetTransformInfo::ShuffleKind>
isGatherShuffledSingleRegisterEntry(
const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
- SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part);
+ SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
+ bool ForOrder);
/// Checks if the gathered \p VL can be represented as multi-register
/// shuffle(s) of previous tree entries.
/// \param TE Tree entry checked for permutation.
/// \param VL List of scalars (a subset of the TE scalar), checked for
/// permutations.
+ /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
+ /// commands to build the mask using the original vector value, without
+ /// relying on the potential reordering.
/// \returns per-register series of ShuffleKind, if gathered values can be
/// represented as shuffles of previous tree entries. \p Mask is filled with
/// the shuffle mask (also on per-register base).
@@ -2441,7 +2448,7 @@ private:
isGatherShuffledEntry(
const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
- unsigned NumParts);
+ unsigned NumParts, bool ForOrder = false);
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
@@ -3788,65 +3795,163 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
- unsigned NumScalars = TE.Scalars.size();
+ // Try to find subvector extract/insert patterns and reorder only such
+ // patterns.
+ SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
+ Type *ScalarTy = GatheredScalars.front()->getType();
+ int NumScalars = GatheredScalars.size();
+ if (!isValidElementType(ScalarTy))
+ return std::nullopt;
+ auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars);
+ int NumParts = TTI->getNumberOfParts(VecTy);
+ if (NumParts == 0 || NumParts >= NumScalars)
+ NumParts = 1;
+ SmallVector<int> ExtractMask;
+ SmallVector<int> Mask;
+ SmallVector<SmallVector<const TreeEntry *>> Entries;
+ SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
+ tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
+ SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
+ isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
+ /*ForOrder=*/true);
+ // No shuffled operands - ignore.
+ if (GatherShuffles.empty() && ExtractShuffles.empty())
+ return std::nullopt;
OrdersType CurrentOrder(NumScalars, NumScalars);
- SmallVector<int> Positions;
- SmallBitVector UsedPositions(NumScalars);
- const TreeEntry *STE = nullptr;
- // Try to find all gathered scalars that are gets vectorized in other
- // vectorize node. Here we can have only one single tree vector node to
- // correctly identify order of the gathered scalars.
- for (unsigned I = 0; I < NumScalars; ++I) {
- Value *V = TE.Scalars[I];
- if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
- continue;
- if (const auto *LocalSTE = getTreeEntry(V)) {
- if (!STE)
- STE = LocalSTE;
- else if (STE != LocalSTE)
- // Take the order only from the single vector node.
- return std::nullopt;
- unsigned Lane =
- std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
- if (Lane >= NumScalars)
- return std::nullopt;
- if (CurrentOrder[Lane] != NumScalars) {
- if (Lane != I)
+ if (GatherShuffles.size() == 1 &&
+ *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
+ Entries.front().front()->isSame(TE.Scalars)) {
+ // Perfect match in the graph, will reuse the previously vectorized
+ // node. Cost is 0.
+ std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
+ return CurrentOrder;
+ }
+ auto IsSplatMask = [](ArrayRef<int> Mask) {
+ int SingleElt = PoisonMaskElem;
+ return all_of(Mask, [&](int I) {
+ if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
+ SingleElt = I;
+ return I == PoisonMaskElem || I == SingleElt;
+ });
+ };
+ // Exclusive broadcast mask - ignore.
+ if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
+ (Entries.size() != 1 ||
+ Entries.front().front()->ReorderIndices.empty())) ||
+ (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
+ return std::nullopt;
+ SmallBitVector ShuffledSubMasks(NumParts);
+ auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
+ ArrayRef<int> Mask, int PartSz, int NumParts,
+ function_ref<unsigned(unsigned)> GetVF) {
+ for (int I : seq<int>(0, NumParts)) {
+ if (ShuffledSubMasks.test(I))
+ continue;
+ const int VF = GetVF(I);
+ if (VF == 0)
+ continue;
+ MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
+ // Shuffle of at least 2 vectors - ignore.
+ if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
+ std::fill(Slice.begin(), Slice.end(), NumScalars);
+ ShuffledSubMasks.set(I);
+ continue;
+ }
+ // Try to include as much elements from the mask as possible.
+ int FirstMin = INT_MAX;
+ int SecondVecFound = false;
+ for (int K : seq<int>(0, PartSz)) {
+ int Idx = Mask[I * PartSz + K];
+ if (Idx == PoisonMaskElem) {
+ Value *V = GatheredScalars[I * PartSz + K];
+ if (isConstant(V) && !isa<PoisonValue>(V)) {
+ SecondVecFound = true;
+ break;
+ }
continue;
- UsedPositions.reset(CurrentOrder[Lane]);
+ }
+ if (Idx < VF) {
+ if (FirstMin > Idx)
+ FirstMin = Idx;
+ } else {
+ SecondVecFound = true;
+ break;
+ }
}
- // The partial identity (where only some elements of the gather node are
- // in the identity order) is good.
- CurrentOrder[Lane] = I;
- UsedPositions.set(I);
- }
- }
- // Need to keep the order if we have a vector entry and at least 2 scalars or
- // the vectorized entry has just 2 scalars.
- if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
- auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
- for (unsigned I = 0; I < NumScalars; ++I)
- if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
- return false;
- return true;
- };
- if (IsIdentityOrder(CurrentOrder))
- return OrdersType();
- auto *It = CurrentOrder.begin();
- for (unsigned I = 0; I < NumScalars;) {
- if (UsedPositions.test(I)) {
- ++I;
+ FirstMin = (FirstMin / PartSz) * PartSz;
+ // Shuffle of at least 2 vectors - ignore.
+ if (SecondVecFound) {
+ std::fill(Slice.begin(), Slice.end(), NumScalars);
+ ShuffledSubMasks.set(I);
continue;
}
- if (*It == NumScalars) {
- *It = I;
- ++I;
+ for (int K : seq<int>(0, PartSz)) {
+ int Idx = Mask[I * PartSz + K];
+ if (Idx == PoisonMaskElem)
+ continue;
+ Idx -= FirstMin;
+ if (Idx >= PartSz) {
+ SecondVecFound = true;
+ break;
+ }
+ if (CurrentOrder[I * PartSz + Idx] >
+ static_cast<unsigned>(I * PartSz + K) &&
+ CurrentOrder[I * PartSz + Idx] !=
+ static_cast<unsigned>(I * PartSz + Idx))
+ CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
+ }
+ // Shuffle of at least 2 vectors - ignore.
+ if (SecondVecFound) {
+ std::fill(Slice.begin(), Slice.end(), NumScalars);
+ ShuffledSubMasks.set(I);
+ continue;
}
- ++It;
}
- return std::move(CurrentOrder);
+ };
+ int PartSz = NumScalars / NumParts;
+ if (!ExtractShuffles.empty())
+ TransformMaskToOrder(
+ CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
+ if (!ExtractShuffles[I])
+ return 0U;
+ unsigned VF = 0;
+ for (unsigned Idx : seq<unsigned>(0, PartSz)) {
+ int K = I * PartSz + Idx;
+ if (ExtractMask[K] == PoisonMaskElem)
+ continue;
+ if (!TE.ReuseShuffleIndices.empty())
+ K = TE.ReuseShuffleIndices[K];
+ if (!TE.ReorderIndices.empty())
+ K = std::distance(TE.ReorderIndices.begin(),
+ find(TE.ReorderIndices, K));
+ auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
+ if (!EI)
+ continue;
+ VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
+ ->getElementCount()
+ .getKnownMinValue());
+ }
+ return VF;
+ });
+ // Check special corner case - single shuffle of the same entry.
+ if (GatherShuffles.size() == 1 && NumParts != 1) {
+ if (ShuffledSubMasks.any())
+ return std::nullopt;
+ PartSz = NumScalars;
+ NumParts = 1;
}
- return std::nullopt;
+ if (!Entries.empty())
+ TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
+ if (!GatherShuffles[I])
+ return 0U;
+ return std::max(Entries[I].front()->getVectorFactor(),
+ Entries[I].back()->getVectorFactor());
+ });
+ int NumUndefs =
+ count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
+ if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
+ return std::nullopt;
+ return std::move(CurrentOrder);
}
namespace {
@@ -4168,9 +4273,59 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
// 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
// element 3 is used twice in the second submask.
unsigned Sz = TE.Scalars.size();
- if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
- Sz))
+ if (TE.State == TreeEntry::NeedToGather) {
+ if (std::optional<OrdersType> CurrentOrder =
+ findReusedOrderedScalars(TE)) {
+ SmallVector<int> Mask;
+ fixupOrderingIndices(*CurrentOrder);
+ inversePermutation(*CurrentOrder, Mask);
+ ::addMask(Mask, TE.ReuseShuffleIndices);
+ OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
+ unsigned Sz = TE.Scalars.size();
+ for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
+ for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
+ if (Idx != PoisonMaskElem)
+ Res[Idx + K * Sz] = I + K * Sz;
+ }
+ return std::move(Res);
+ }
+ }
+ if (Sz == 2 && TE.getVectorFactor() == 4 &&
+ TTI->getNumberOfParts(FixedVectorType::get(
+ TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
return std::nullopt;
+ if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
+ Sz)) {
+ SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
+ if (TE.ReorderIndices.empty())
+ std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
+ else
+ inversePermutation(TE.ReorderIndices, ReorderMask);
+ ::addMask(ReorderMask, TE.ReuseShuffleIndices);
+ unsigned VF = ReorderMask.size();
+ OrdersType ResOrder(VF, VF);
+ unsigned NumParts = VF / Sz;
+ SmallBitVector UsedVals(NumParts);
+ for (unsigned I = 0; I < VF; I += Sz) {
+ int Val = PoisonMaskElem;
+ unsigned UndefCnt = 0;
+ if (any_of(ArrayRef(ReorderMask).slice(I, Sz),
+ [&](int Idx) {
+ if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
+ Val = Idx;
+ if (Idx == PoisonMaskElem)
+ ++UndefCnt;
+ return Idx != PoisonMaskElem && Idx != Val;
+ }) ||
+ Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
+ UndefCnt > Sz / 2)
+ return std::nullopt;
+ UsedVals.set(Val);
+ for (unsigned K = 0; K < NumParts; ++K)
+ ResOrder[Val + Sz * K] = I + K;
+ }
+ return std::move(ResOrder);
+ }
unsigned VF = TE.getVectorFactor();
// Try build correct order for extractelement instructions.
SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
@@ -4208,7 +4363,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
std::advance(It, Sz);
}
- if (all_of(enumerate(ResOrder),
+ if (TE.State == TreeEntry::NeedToGather &&
+ all_of(enumerate(ResOrder),
[](const auto &Data) { return Data.index() == Data.value(); }))
return std::nullopt; // No need to reorder.
return std::move(ResOrder);
@@ -4298,11 +4454,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
OrdersType CurrentOrder;
bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
/*ResizeAllowed=*/true);
- if (Reuse || !CurrentOrder.empty()) {
- if (!CurrentOrder.empty())
- fixupOrderingIndices(CurrentOrder);
+ if (Reuse || !CurrentOrder.empty())
return std::move(CurrentOrder);
- }
}
// If the gather node is <undef, v, .., poison> and
// insertelement poison, v, 0 [+ permute]
@@ -4335,8 +4488,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
PoisonValue::get(Ty), *It);
- if (InsertFirstCost + PermuteCost < InsertIdxCost)
+ if (InsertFirstCost + PermuteCost < InsertIdxCost) {
+ OrdersType Order(Sz, Sz);
+ Order[Idx] = 0;
return std::move(Order);
+ }
}
}
if (isSplat(TE.Scalars))
@@ -4392,6 +4548,28 @@ void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
std::iota(It, std::next(It, Sz), 0);
}
+static void combineOrders(MutableArrayRef<unsigned> Order,
+ ArrayRef<unsigned> SecondaryOrder) {
+ assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
+ "Expected same size of orders");
+ unsigned Sz = Order.size();
+ SmallBitVector UsedIndices(Sz);
+ for (unsigned Idx : seq<unsigned>(0, Sz)) {
+ if (Order[Idx] != Sz)
+ UsedIndices.set(Order[Idx]);
+ }
+ if (SecondaryOrder.empty()) {
+ for (unsigned Idx : seq<unsigned>(0, Sz))
+ if (Order[Idx] == Sz && !UsedIndices.test(Idx))
+ Order[Idx] = Idx;
+ } else {
+ for (unsigned Idx : seq<unsigned>(0, Sz))
+ if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
+ !UsedIndices.test(SecondaryOrder[Idx]))
+ Order[Idx] = SecondaryOrder[Idx];
+ }
+}
+
void BoUpSLP::reorderTopToBottom() {
// Maps VF to the graph nodes.
DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
@@ -4560,18 +4738,46 @@ void BoUpSLP::reorderTopToBottom() {
}
if (OrdersUses.empty())
continue;
+ auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
+ const unsigned Sz = Order.size();
+ for (unsigned Idx : seq<unsigned>(0, Sz))
+ if (Idx != Order[Idx] && Order[Idx] != Sz)
+ return false;
+ return true;
+ };
// Choose the most used order.
- ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
- unsigned Cnt = OrdersUses.front().second;
- for (const auto &Pair : drop_begin(OrdersUses)) {
- if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
+ unsigned IdentityCnt = 0;
+ unsigned FilledIdentityCnt = 0;
+ OrdersType IdentityOrder(VF, VF);
+ for (auto &Pair : OrdersUses) {
+ if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
+ if (!Pair.first.empty())
+ FilledIdentityCnt += Pair.second;
+ IdentityCnt += Pair.second;
+ combineOrders(IdentityOrder, Pair.first);
+ }
+ }
+ MutableArrayRef<unsigned> BestOrder = IdentityOrder;
+ unsigned Cnt = IdentityCnt;
+ for (auto &Pair : OrdersUses) {
+ // Prefer identity order. But, if filled identity found (non-empty order)
+ // with same number of uses, as the new candidate order, we can choose
+ // this candidate order.
+ if (Cnt < Pair.second ||
+ (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
+ Cnt == Pair.second && !BestOrder.empty() &&
+ IsIdentityOrder(BestOrder))) {
+ combineOrders(Pair.first, BestOrder);
BestOrder = Pair.first;
Cnt = Pair.second;
+ } else {
+ combineOrders(BestOrder, Pair.first);
}
}
// Set order of the user node.
- if (BestOrder.empty())
+ if (IsIdentityOrder(BestOrder))
continue;
+ fixupOrderingIndices(BestOrder);
SmallVector<int> Mask;
inversePermutation(BestOrder, Mask);
SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
@@ -4685,7 +4891,7 @@ bool BoUpSLP::canReorderOperands(
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SetVector<TreeEntry *> OrderedEntries;
- DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
+ DenseSet<const TreeEntry *> GathersToOrders;
// Find all reorderable leaf nodes with the given VF.
// Currently the are vectorized loads,extracts without alternate operands +
// some gathering of extracts.
@@ -4700,7 +4906,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize) ||
!TE->ReuseShuffleIndices.empty())
- GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
+ GathersToOrders.insert(TE.get());
}
}
@@ -4718,7 +4924,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
(TE->State == TreeEntry::NeedToGather &&
- GathersToOrders.count(TE))) ||
+ GathersToOrders.contains(TE))) ||
TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
!all_of(drop_begin(TE->UserTreeIndices),
[TE](const EdgeInfo &EI) {
@@ -4775,9 +4981,14 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
const auto Order = [&]() -> const OrdersType {
if (OpTE->State == TreeEntry::NeedToGather ||
!OpTE->ReuseShuffleIndices.empty())
- return GathersToOrders.find(OpTE)->second;
+ return getReorderingData(*OpTE, /*TopToBottom=*/false)
+ .value_or(OrdersType(1));
return OpTE->ReorderIndices;
}();
+ // The order is partially ordered, skip it in favor of fully non-ordered
+ // orders.
+ if (Order.size() == 1)
+ continue;
unsigned NumOps = count_if(
Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
return P.second == OpTE;
@@ -4805,9 +5016,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
(IgnoreReorder && TE->Idx == 0))
return true;
if (TE->State == TreeEntry::NeedToGather) {
- auto It = GathersToOrders.find(TE);
- if (It != GathersToOrders.end())
- return !It->second.empty();
+ if (GathersToOrders.contains(TE))
+ return !getReorderingData(*TE, /*TopToBottom=*/false)
+ .value_or(OrdersType(1))
+ .empty();
return true;
}
return false;
@@ -4839,21 +5051,49 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
++Res.first->second;
}
}
- // Choose the best order.
- ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
- unsigned Cnt = OrdersUses.front().second;
- for (const auto &Pair : drop_begin(OrdersUses)) {
- if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
+ if (OrdersUses.empty()) {
+ for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
+ OrderedEntries.remove(Op.second);
+ continue;
+ }
+ auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
+ const unsigned Sz = Order.size();
+ for (unsigned Idx : seq<unsigned>(0, Sz))
+ if (Idx != Order[Idx] && Order[Idx] != Sz)
+ return false;
+ return true;
+ };
+ // Choose the most used order.
+ unsigned IdentityCnt = 0;
+ unsigned VF = Data.second.front().second->getVectorFactor();
+ OrdersType IdentityOrder(VF, VF);
+ for (auto &Pair : OrdersUses) {
+ if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
+ IdentityCnt += Pair.second;
+ combineOrders(IdentityOrder, Pair.first);
+ }
+ }
+ MutableArrayRef<unsigned> BestOrder = IdentityOrder;
+ unsigned Cnt = IdentityCnt;
+ for (auto &Pair : OrdersUses) {
+ // Prefer identity order. But, if filled identity found (non-empty
+ // order) with same number of uses, as the new candidate order, we can
+ // choose this candidate order.
+ if (Cnt < Pair.second) {
+ combineOrders(Pair.first, BestOrder);
BestOrder = Pair.first;
Cnt = Pair.second;
+ } else {
+ combineOrders(BestOrder, Pair.first);
}
}
- // Set order of the user node (reordering of operands and user nodes).
- if (BestOrder.empty()) {
+ // Set order of the user node.
+ if (IsIdentityOrder(BestOrder)) {
for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
OrderedEntries.remove(Op.second);
continue;
}
+ fixupOrderingIndices(BestOrder);
// Erase operands from OrderedEntries list and adjust their orders.
VisitedOps.clear();
SmallVector<int> Mask;
@@ -7472,6 +7712,20 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
V1 = Constant::getNullValue(
FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+ // Not identity/broadcast? Try to see if the original vector is better.
+ if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
+ CommonVF == CommonMask.size() &&
+ any_of(enumerate(CommonMask),
+ [](const auto &&P) {
+ return P.value() != PoisonMaskElem &&
+ static_cast<unsigned>(P.value()) != P.index();
+ }) &&
+ any_of(CommonMask,
+ [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
+ SmallVector<int> ReorderMask;
+ inversePermutation(E->ReorderIndices, ReorderMask);
+ ::addMask(CommonMask, ReorderMask);
+ }
} else if (V1 && P2.isNull()) {
// Shuffle single vector.
CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
@@ -9433,7 +9687,7 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
- SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part) {
+ SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
Entries.clear();
// TODO: currently checking only for Scalars in the tree entry, need to count
// reused elements too for better cost estimation.
@@ -9532,6 +9786,21 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
VToTEs.insert(TEPtr);
}
if (const TreeEntry *VTE = getTreeEntry(V)) {
+ if (ForOrder) {
+ if (VTE->State != TreeEntry::Vectorize) {
+ auto It = MultiNodeScalars.find(V);
+ if (It == MultiNodeScalars.end())
+ continue;
+ VTE = *It->getSecond().begin();
+ // Iterate through all vectorized nodes.
+ auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
+ return MTE->State == TreeEntry::Vectorize;
+ });
+ if (MIt == It->getSecond().end())
+ continue;
+ VTE = *MIt;
+ }
+ }
Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
continue;
@@ -9765,8 +10034,12 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
// scalar in the list.
for (const std::pair<unsigned, int> &Pair : EntryLanes) {
unsigned Idx = Part * VL.size() + Pair.second;
- Mask[Idx] = Pair.first * VF +
- Entries[Pair.first]->findLaneForValue(VL[Pair.second]);
+ Mask[Idx] =
+ Pair.first * VF +
+ (ForOrder ? std::distance(
+ Entries[Pair.first]->Scalars.begin(),
+ find(Entries[Pair.first]->Scalars, VL[Pair.second]))
+ : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
IsIdentity &= Mask[Idx] == Pair.second;
}
switch (Entries.size()) {
@@ -9791,8 +10064,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
- SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
- unsigned NumParts) {
+ SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
+ bool ForOrder) {
assert(NumParts > 0 && NumParts < VL.size() &&
"Expected positive number of registers.");
Entries.clear();
@@ -9810,7 +10083,8 @@ BoUpSLP::isGatherShuffledEntry(
ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
std::optional<TTI::ShuffleKind> SubRes =
- isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part);
+ isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
+ ForOrder);
if (!SubRes)
SubEntries.clear();
Res.push_back(SubRes);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 240d4bd..a2a203c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -385,9 +385,6 @@ struct VPTransformState {
VPValue2ValueTy VPValue2Value;
- /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
- Value *CanonicalIV = nullptr;
-
/// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
InnerLoopVectorizer *ILV;