author    Alexey Bataev <a.bataev@outlook.com>    2026-02-05 10:42:08 -0500
committer Alexey Bataev <a.bataev@outlook.com>    2026-02-05 11:16:08 -0800
commit    fe754dff6d7ed3f7d69e5a97846d9320cb407977 (patch)
tree      91d35dcc6eb38ac6a4bab326dbf8bd5872a89ecc /llvm/lib/Transforms/Vectorize
parent    5326166866e3a1c2ecc260830d19b03054ff6418 (diff)
download  llvm-fe754dff6d7ed3f7d69e5a97846d9320cb407977.tar.gz
          llvm-fe754dff6d7ed3f7d69e5a97846d9320cb407977.tar.bz2
          llvm-fe754dff6d7ed3f7d69e5a97846d9320cb407977.zip
[SLP]Remove LoadCombine workaround after handling of the copyables
LoadCombine pattern handling was added as a workaround for cases where the SLP vectorizer could not vectorize the code effectively. With copyables support, the vectorizer can now handle these patterns directly. The patch also adds support for the scalar load [+ bswap] pattern built from byte-sized loads (a single wide vector load, plus a byte reverse for the bswap variant).

Recommit after revert in 6377c86d718232fe60c548dfd7ab439f7ff84df7

Reviewers: RKSimon, hiraditya

Pull Request: https://github.com/llvm/llvm-project/pull/174205
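For context, a minimal C++ sketch (illustrative only, not taken from the patch or its tests) of the kind of byte-load/shl/zext/or pattern this change targets: the removed isLoadCombineCandidate checks used to leave such code for backend load combining, whereas with this patch SLP itself can emit a single wide load, plus a bswap for the byte-reversed variant. The name load_be64 and the uint8_t/uint64_t types here are assumptions made for the example, not identifiers from the commit.

    #include <cstdint>
    #include <cstdio>

    // Illustrative only: a big-endian 64-bit value assembled from eight
    // byte-sized loads, zero-extends, shifts and ors. In IR this becomes the
    // shl/zext/or-over-i8-loads pattern; on a little-endian target the whole
    // body can be emitted as one i64 load followed by llvm.bswap.i64.
    static uint64_t load_be64(const uint8_t *p) {
      return (uint64_t)p[0] << 56 | (uint64_t)p[1] << 48 |
             (uint64_t)p[2] << 40 | (uint64_t)p[3] << 32 |
             (uint64_t)p[4] << 24 | (uint64_t)p[5] << 16 |
             (uint64_t)p[6] << 8  | (uint64_t)p[7];
    }

    int main() {
      uint8_t buf[8] = {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
      std::printf("%llx\n", (unsigned long long)load_be64(buf));
      return 0;
    }

The variant that reads p[0] as the low byte needs no byte reverse and would correspond to the new ReducedBitcastLoads combined node; the big-endian form above would correspond to ReducedBitcastBSwapLoads on a little-endian target.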
Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
-rw-r--r--   llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp   244
1 file changed, 145 insertions(+), 99 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7575cfb25051..f89c22fafcf0 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2100,12 +2100,16 @@ public:
VectorizableTree.front()->getVectorFactor());
}
- /// Returns the opcode of the root node, or 0, if the root node is gather.
+ /// Returns true if the tree results in one of the reduced bitcasts variants.
bool isReducedBitcastRoot() const {
return VectorizableTree.front()->hasState() &&
(VectorizableTree.front()->CombinedOp == TreeEntry::ReducedBitcast ||
VectorizableTree.front()->CombinedOp ==
- TreeEntry::ReducedBitcastBSwap) &&
+ TreeEntry::ReducedBitcastBSwap ||
+ VectorizableTree.front()->CombinedOp ==
+ TreeEntry::ReducedBitcastLoads ||
+ VectorizableTree.front()->CombinedOp ==
+ TreeEntry::ReducedBitcastBSwapLoads) &&
VectorizableTree.front()->State == TreeEntry::Vectorize;
}
@@ -2270,23 +2274,6 @@ public:
/// effectively than the base graph.
bool isTreeNotExtendable() const;
- /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
- /// can be load combined in the backend. Load combining may not be allowed in
- /// the IR optimizer, so we do not want to alter the pattern. For example,
- /// partially transforming a scalar bswap() pattern into vector code is
- /// effectively impossible for the backend to undo.
- /// TODO: If load combining is allowed in the IR optimizer, this analysis
- /// may not be necessary.
- bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
-
- /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
- /// can be load combined in the backend. Load combining may not be allowed in
- /// the IR optimizer, so we do not want to alter the pattern. For example,
- /// partially transforming a scalar bswap() pattern into vector code is
- /// effectively impossible for the backend to undo.
- /// TODO: If load combining is allowed in the IR optimizer, this analysis
- /// may not be necessary.
- bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
Align Alignment, const int64_t Diff,
const size_t Sz) const;
@@ -3932,8 +3919,9 @@ private:
/// .., 56))-like pattern.
/// If the int shifts unique, also strided, but not ordered, sets \p Order.
/// If the node can be represented as a bitcast + bswap, sets \p IsBSwap.
- bool matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
- bool &IsBSwap) const;
+ /// If the root nodes are loads, sets \p ForLoads to true.
+ bool matchesShlZExt(const TreeEntry &TE, OrdersType &Order, bool &IsBSwap,
+ bool &ForLoads) const;
class TreeEntry {
public:
@@ -4066,6 +4054,8 @@ private:
FMulAdd,
ReducedBitcast,
ReducedBitcastBSwap,
+ ReducedBitcastLoads,
+ ReducedBitcastBSwapLoads,
};
CombinedOpcode CombinedOp = NotCombinedOp;
@@ -13353,10 +13343,11 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
}
bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
- bool &IsBSwap) const {
+ bool &IsBSwap, bool &ForLoads) const {
assert(TE.hasState() && TE.getOpcode() == Instruction::Shl &&
"Expected Shl node.");
IsBSwap = false;
+ ForLoads = false;
if (TE.State != TreeEntry::Vectorize || !TE.ReorderIndices.empty() ||
!TE.ReuseShuffleIndices.empty() || MinBWs.contains(&TE) ||
any_of(TE.Scalars, [](Value *V) { return !V->hasOneUse(); }))
@@ -13463,6 +13454,44 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
if (BSwapCost <= BitcastCost) {
BitcastCost = BSwapCost;
IsBSwap = true;
+ Order.clear();
+ // Check for loads in the ZExt node.
+ const TreeEntry *SrcTE = getOperandEntry(LhsTE, /*Idx=*/0);
+ if (SrcTE->State == TreeEntry::Vectorize &&
+ SrcTE->ReorderIndices.empty() && SrcTE->ReuseShuffleIndices.empty() &&
+ SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
+ all_of(SrcTE->Scalars, [](Value *V) { return V->hasOneUse(); })) {
+ auto *LI = cast<LoadInst>(SrcTE->getMainOp());
+ IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, ScalarTy,
+ {ScalarTy});
+ InstructionCost BSwapCost =
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, LI->getAlign(),
+ LI->getPointerAddressSpace(), CostKind) +
+ TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
+ if (BSwapCost <= BitcastCost) {
+ VecCost +=
+ TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
+ LI->getPointerAddressSpace(), CostKind);
+ BitcastCost = BSwapCost;
+ ForLoads = true;
+ }
+ }
+ }
+ } else if (Order.empty() && DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
+ // Check for loads in the ZExt node.
+ const TreeEntry *SrcTE = getOperandEntry(LhsTE, /*Idx=*/0);
+ if (SrcTE->State == TreeEntry::Vectorize && SrcTE->ReorderIndices.empty() &&
+ SrcTE->ReuseShuffleIndices.empty() &&
+ SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
+ all_of(SrcTE->Scalars, [](Value *V) { return V->hasOneUse(); })) {
+ auto *LI = cast<LoadInst>(SrcTE->getMainOp());
+ BitcastCost =
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, LI->getAlign(),
+ LI->getPointerAddressSpace(), CostKind);
+ VecCost +=
+ TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
+ LI->getPointerAddressSpace(), CostKind);
+ ForLoads = true;
}
}
return BitcastCost < VecCost;
@@ -13884,14 +13913,17 @@ void BoUpSLP::transformNodes() {
break;
OrdersType Order;
bool IsBSwap;
- if (!matchesShlZExt(E, Order, IsBSwap))
+ bool ForLoads;
+ if (!matchesShlZExt(E, Order, IsBSwap, ForLoads))
break;
// This node is a (reduced disjoint or) bitcast node.
TreeEntry::CombinedOpcode Code =
- IsBSwap ? TreeEntry::ReducedBitcastBSwap : TreeEntry::ReducedBitcast;
+ IsBSwap ? (ForLoads ? TreeEntry::ReducedBitcastBSwapLoads
+ : TreeEntry::ReducedBitcastBSwap)
+ : (ForLoads ? TreeEntry::ReducedBitcastLoads
+ : TreeEntry::ReducedBitcast);
E.CombinedOp = Code;
- if (!IsBSwap)
- E.ReorderIndices = std::move(Order);
+ E.ReorderIndices = std::move(Order);
TreeEntry *ZExtEntry = getOperandEntry(&E, 0);
assert(ZExtEntry->UserTreeIndex &&
ZExtEntry->State == TreeEntry::Vectorize &&
@@ -13900,6 +13932,16 @@ void BoUpSLP::transformNodes() {
// The ZExt node is part of the combined node.
ZExtEntry->State = TreeEntry::CombinedVectorize;
ZExtEntry->CombinedOp = Code;
+ if (ForLoads) {
+ TreeEntry *LoadsEntry = getOperandEntry(ZExtEntry, 0);
+ assert(LoadsEntry->UserTreeIndex &&
+ LoadsEntry->State == TreeEntry::Vectorize &&
+ LoadsEntry->getOpcode() == Instruction::Load &&
+ "Expected Load node.");
+ // The Load node is part of the combined node.
+ LoadsEntry->State = TreeEntry::CombinedVectorize;
+ LoadsEntry->CombinedOp = Code;
+ }
TreeEntry *ConstEntry = getOperandEntry(&E, 1);
assert(ConstEntry->UserTreeIndex && ConstEntry->isGather() &&
"Expected ZExt node.");
@@ -15559,6 +15601,44 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
};
return GetCostDiff(GetScalarCost, GetVectorCost);
}
+ case TreeEntry::ReducedBitcastLoads:
+ case TreeEntry::ReducedBitcastBSwapLoads: {
+ auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
+ if (isa<PoisonValue>(UniqueValues[Idx]))
+ return InstructionCost(TTI::TCC_Free);
+ auto *Shl = dyn_cast<Instruction>(UniqueValues[Idx]);
+ if (!Shl)
+ return InstructionCost(TTI::TCC_Free);
+ InstructionCost ScalarCost = TTI.getInstructionCost(Shl, CostKind);
+ auto *ZExt = dyn_cast<Instruction>(Shl->getOperand(0));
+ if (!ZExt)
+ return ScalarCost;
+ ScalarCost += TTI.getInstructionCost(ZExt, CostKind);
+ auto *Load = dyn_cast<Instruction>(ZExt->getOperand(0));
+ if (!Load)
+ return ScalarCost;
+ ScalarCost += TTI.getInstructionCost(Load, CostKind);
+ return ScalarCost;
+ };
+ auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
+ const TreeEntry *LhsTE = getOperandEntry(E, /*Idx=*/0);
+ const TreeEntry *LoadTE = getOperandEntry(LhsTE, /*Idx=*/0);
+ auto *LI0 = cast<LoadInst>(LoadTE->getMainOp());
+ auto *OrigScalarTy = E->getMainOp()->getType();
+ InstructionCost LoadCost =
+ TTI.getMemoryOpCost(Instruction::Load, OrigScalarTy, LI0->getAlign(),
+ LI0->getPointerAddressSpace(), CostKind);
+ if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
+ IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, OrigScalarTy,
+ {OrigScalarTy});
+ InstructionCost IntrinsicCost =
+ TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
+ LoadCost += IntrinsicCost;
+ }
+ return LoadCost + CommonCost;
+ };
+ return GetCostDiff(GetScalarCost, GetVectorCost);
+ }
case Instruction::FNeg:
case Instruction::Add:
case Instruction::FAdd:
@@ -16043,69 +16123,6 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
return true;
}
-static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
- TargetTransformInfo *TTI,
- bool MustMatchOrInst) {
- // Look past the root to find a source value. Arbitrarily follow the
- // path through operand 0 of any 'or'. Also, peek through optional
- // shift-left-by-multiple-of-8-bits.
- Value *ZextLoad = Root;
- const APInt *ShAmtC;
- bool FoundOr = false;
- while (!isa<ConstantExpr>(ZextLoad) &&
- (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
- (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
- ShAmtC->urem(8) == 0))) {
- auto *BinOp = cast<BinaryOperator>(ZextLoad);
- ZextLoad = BinOp->getOperand(0);
- if (BinOp->getOpcode() == Instruction::Or)
- FoundOr = true;
- }
- // Check if the input is an extended load of the required or/shift expression.
- Value *Load;
- if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
- !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
- return false;
-
- // Require that the total load bit width is a legal integer type.
- // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
- // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
- Type *SrcTy = Load->getType();
- unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
- if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
- return false;
-
- // Everything matched - assume that we can fold the whole sequence using
- // load combining.
- LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
- << *(cast<Instruction>(Root)) << "\n");
-
- return true;
-}
-
-bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
- if (RdxKind != RecurKind::Or)
- return false;
-
- unsigned NumElts = VectorizableTree[0]->Scalars.size();
- Value *FirstReduced = VectorizableTree[0]->Scalars[0];
- return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
- /* MatchOr */ false);
-}
-
-bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
- // Peek through a final sequence of stores and check if all operations are
- // likely to be load-combined.
- unsigned NumElts = Stores.size();
- for (Value *Scalar : Stores) {
- Value *X;
- if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
- !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
- return false;
- }
- return true;
-}
-
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
if (!DebugCounter::shouldExecute(VectorizedGraphs))
return true;
@@ -16334,7 +16351,9 @@ InstructionCost BoUpSLP::getSpillCost() {
SmallPtrSet<const TreeEntry *, 8> ScalarOrPseudoEntries;
for (const auto &TEPtr : VectorizableTree) {
if (TEPtr->CombinedOp == TreeEntry::ReducedBitcast ||
- TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwap) {
+ TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
+ TEPtr->CombinedOp == TreeEntry::ReducedBitcastLoads ||
+ TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads) {
ScalarOrPseudoEntries.insert(TEPtr.get());
continue;
}
@@ -20269,6 +20288,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
switch (E->CombinedOp) {
case TreeEntry::ReducedBitcast:
case TreeEntry::ReducedBitcastBSwap:
+ case TreeEntry::ReducedBitcastLoads:
+ case TreeEntry::ReducedBitcastBSwapLoads:
ShuffleOrOp = E->CombinedOp;
break;
default:
@@ -21236,6 +21257,31 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
++NumVectorInstructions;
return V;
}
+ case TreeEntry::ReducedBitcastLoads:
+ case TreeEntry::ReducedBitcastBSwapLoads: {
+ assert(UserIgnoreList && "Expected reduction operations only.");
+ TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
+ TreeEntry *Load = getOperandEntry(ZExt, /*Idx=*/0);
+ setInsertPointAfterBundle(Load);
+ ZExt->VectorizedValue = PoisonValue::get(getWidenedType(
+ ZExt->getMainOp()->getType(), ZExt->getVectorFactor()));
+ TreeEntry *Const = getOperandEntry(E, /*Idx=*/1);
+ Const->VectorizedValue = PoisonValue::get(getWidenedType(
+ Const->Scalars.front()->getType(), Const->getVectorFactor()));
+ Load->VectorizedValue = PoisonValue::get(getWidenedType(
+ Load->getMainOp()->getType(), Load->getVectorFactor()));
+ LoadInst *LI = cast<LoadInst>(Load->getMainOp());
+ Value *PO = LI->getPointerOperand();
+ Type *ScalarTy = ZExt->getMainOp()->getType();
+ Value *V = Builder.CreateAlignedLoad(ScalarTy, PO, LI->getAlign());
+ ++NumVectorInstructions;
+ if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
+ V = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, V);
+ ++NumVectorInstructions;
+ }
+ E->VectorizedValue = V;
+ return V;
+ }
default:
llvm_unreachable("unknown inst");
}
@@ -21260,10 +21306,15 @@ Value *BoUpSLP::vectorizeTree(
// Cache last instructions for the nodes to avoid side effects, which may
// appear during vectorization, like extra uses, etc.
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ // Need to generate insertion point for loads nodes of the bitcast/bswap
+ // ops.
if (TE->isGather() || DeletedNodes.contains(TE.get()) ||
(TE->State == TreeEntry::CombinedVectorize &&
(TE->CombinedOp == TreeEntry::ReducedBitcast ||
- TE->CombinedOp == TreeEntry::ReducedBitcastBSwap)))
+ TE->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
+ ((TE->CombinedOp == TreeEntry::ReducedBitcastLoads ||
+ TE->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads) &&
+ (!TE->hasState() || TE->getOpcode() != Instruction::Load)))))
continue;
(void)getLastInstructionInBundle(TE.get());
}
@@ -21822,7 +21873,9 @@ Value *BoUpSLP::vectorizeTree(
continue;
if (Entry->CombinedOp == TreeEntry::ReducedBitcast ||
- Entry->CombinedOp == TreeEntry::ReducedBitcastBSwap) {
+ Entry->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
+ Entry->CombinedOp == TreeEntry::ReducedBitcastLoads ||
+ Entry->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads) {
// Skip constant node
if (!Entry->hasState()) {
assert(allConstant(Entry->Scalars) && "Expected constants only.");
@@ -24138,8 +24191,6 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
return false;
}
}
- if (R.isLoadCombineCandidate(Chain))
- return true;
R.buildTree(Chain);
// Check if tree tiny and store itself or its value is not vectorized.
if (R.isTreeTinyAndNotFullyVectorizable()) {
@@ -25744,11 +25795,6 @@ public:
V.analyzedReductionVals(VL);
continue;
}
- if (V.isLoadCombineReductionCandidate(RdxKind)) {
- if (!AdjustReducedVals())
- V.analyzedReductionVals(VL);
- continue;
- }
V.reorderTopToBottom();
// No need to reorder the root node at all for reassociative reduction.
V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||