| author | Alexey Bataev <a.bataev@outlook.com> | 2026-02-05 10:42:08 -0500 |
|---|---|---|
| committer | Alexey Bataev <a.bataev@outlook.com> | 2026-02-05 11:16:08 -0800 |
| commit | fe754dff6d7ed3f7d69e5a97846d9320cb407977 (patch) | |
| tree | 91d35dcc6eb38ac6a4bab326dbf8bd5872a89ecc /llvm/lib/Transforms/Vectorize | |
| parent | 5326166866e3a1c2ecc260830d19b03054ff6418 (diff) | |
[SLP] Remove LoadCombine workaround after handling of copyables
The LoadCombine pattern handling was added as a workaround for cases where the
SLP vectorizer could not vectorize the code effectively. With copyables
support, the vectorizer can now handle these cases directly.
The patch also adds support for the scalar load [+ bswap] pattern for
byte-sized loads (with the bytes reversed in the bswap case).
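As an illustration (not part of the commit), a minimal C++ sketch of the kind of scalar pattern targeted here, assuming a little-endian target; the function names `load_le32`/`load_be32` and the 32-bit width are hypothetical. Byte loads that are zero-extended, shifted by multiples of 8, and or'ed together amount to a single wide load, and the byte-reversed variant amounts to a wide load plus a byte swap (llvm.bswap).

```cpp
#include <cstdint>

// Bytes assembled in memory order: on a little-endian target this whole
// expression is equivalent to a single unaligned 32-bit load.
uint32_t load_le32(const uint8_t *p) {
  return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
         ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

// Bytes assembled in reverse order: on a little-endian target this is
// equivalent to a 32-bit load followed by a byte swap.
uint32_t load_be32(const uint8_t *p) {
  return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
         ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}
```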
Recommit after revert in 6377c86d718232fe60c548dfd7ab439f7ff84df7
Reviewers: RKSimon, hiraditya
Pull Request: https://github.com/llvm/llvm-project/pull/174205
Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 244 |
1 file changed, 145 insertions(+), 99 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7575cfb25051..f89c22fafcf0 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2100,12 +2100,16 @@ public:
                     VectorizableTree.front()->getVectorFactor());
   }
 
-  /// Returns the opcode of the root node, or 0, if the root node is gather.
+  /// Returns true if the tree results in one of the reduced bitcasts variants.
   bool isReducedBitcastRoot() const {
     return VectorizableTree.front()->hasState() &&
            (VectorizableTree.front()->CombinedOp == TreeEntry::ReducedBitcast ||
             VectorizableTree.front()->CombinedOp ==
-                TreeEntry::ReducedBitcastBSwap) &&
+                TreeEntry::ReducedBitcastBSwap ||
+            VectorizableTree.front()->CombinedOp ==
+                TreeEntry::ReducedBitcastLoads ||
+            VectorizableTree.front()->CombinedOp ==
+                TreeEntry::ReducedBitcastBSwapLoads) &&
            VectorizableTree.front()->State == TreeEntry::Vectorize;
   }
@@ -2270,23 +2274,6 @@ public:
   /// effectively than the base graph.
   bool isTreeNotExtendable() const;
 
-  /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
-  /// can be load combined in the backend. Load combining may not be allowed in
-  /// the IR optimizer, so we do not want to alter the pattern. For example,
-  /// partially transforming a scalar bswap() pattern into vector code is
-  /// effectively impossible for the backend to undo.
-  /// TODO: If load combining is allowed in the IR optimizer, this analysis
-  /// may not be necessary.
-  bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
-
-  /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
-  /// can be load combined in the backend. Load combining may not be allowed in
-  /// the IR optimizer, so we do not want to alter the pattern. For example,
-  /// partially transforming a scalar bswap() pattern into vector code is
-  /// effectively impossible for the backend to undo.
-  /// TODO: If load combining is allowed in the IR optimizer, this analysis
-  /// may not be necessary.
-  bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
   bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
                      Align Alignment, const int64_t Diff, const size_t Sz) const;
@@ -3932,8 +3919,9 @@ private:
   /// .., 56))-like pattern.
   /// If the int shifts unique, also strided, but not ordered, sets \p Order.
   /// If the node can be represented as a bitcast + bswap, sets \p IsBSwap.
-  bool matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
-                      bool &IsBSwap) const;
+  /// If the root nodes are loads, sets \p ForLoads to true.
+  bool matchesShlZExt(const TreeEntry &TE, OrdersType &Order, bool &IsBSwap,
+                      bool &ForLoads) const;
 
   class TreeEntry {
   public:
@@ -4066,6 +4054,8 @@ private:
     FMulAdd,
     ReducedBitcast,
     ReducedBitcastBSwap,
+    ReducedBitcastLoads,
+    ReducedBitcastBSwapLoads,
   };
 
   CombinedOpcode CombinedOp = NotCombinedOp;
@@ -13353,10 +13343,11 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
 }
 
 bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
-                             bool &IsBSwap) const {
+                             bool &IsBSwap, bool &ForLoads) const {
   assert(TE.hasState() && TE.getOpcode() == Instruction::Shl &&
          "Expected Shl node.");
   IsBSwap = false;
+  ForLoads = false;
   if (TE.State != TreeEntry::Vectorize || !TE.ReorderIndices.empty() ||
       !TE.ReuseShuffleIndices.empty() || MinBWs.contains(&TE) ||
       any_of(TE.Scalars, [](Value *V) { return !V->hasOneUse(); }))
@@ -13463,6 +13454,44 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
     if (BSwapCost <= BitcastCost) {
       BitcastCost = BSwapCost;
       IsBSwap = true;
+      Order.clear();
+      // Check for loads in the ZExt node.
+      const TreeEntry *SrcTE = getOperandEntry(LhsTE, /*Idx=*/0);
+      if (SrcTE->State == TreeEntry::Vectorize &&
+          SrcTE->ReorderIndices.empty() && SrcTE->ReuseShuffleIndices.empty() &&
+          SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
+          all_of(SrcTE->Scalars, [](Value *V) { return V->hasOneUse(); })) {
+        auto *LI = cast<LoadInst>(SrcTE->getMainOp());
+        IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, ScalarTy,
+                                          {ScalarTy});
+        InstructionCost BSwapCost =
+            TTI->getMemoryOpCost(Instruction::Load, ScalarTy, LI->getAlign(),
+                                 LI->getPointerAddressSpace(), CostKind) +
+            TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
+        if (BSwapCost <= BitcastCost) {
+          VecCost +=
+              TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
+                                   LI->getPointerAddressSpace(), CostKind);
+          BitcastCost = BSwapCost;
+          ForLoads = true;
+        }
+      }
+    }
+  } else if (Order.empty() && DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
+    // Check for loads in the ZExt node.
+    const TreeEntry *SrcTE = getOperandEntry(LhsTE, /*Idx=*/0);
+    if (SrcTE->State == TreeEntry::Vectorize && SrcTE->ReorderIndices.empty() &&
+        SrcTE->ReuseShuffleIndices.empty() &&
+        SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
+        all_of(SrcTE->Scalars, [](Value *V) { return V->hasOneUse(); })) {
+      auto *LI = cast<LoadInst>(SrcTE->getMainOp());
+      BitcastCost =
+          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, LI->getAlign(),
+                               LI->getPointerAddressSpace(), CostKind);
+      VecCost +=
+          TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
+                               LI->getPointerAddressSpace(), CostKind);
+      ForLoads = true;
     }
   }
   return BitcastCost < VecCost;
@@ -13884,14 +13913,17 @@ void BoUpSLP::transformNodes() {
         break;
       OrdersType Order;
       bool IsBSwap;
-      if (!matchesShlZExt(E, Order, IsBSwap))
+      bool ForLoads;
+      if (!matchesShlZExt(E, Order, IsBSwap, ForLoads))
         break;
       // This node is a (reduced disjoint or) bitcast node.
       TreeEntry::CombinedOpcode Code =
-          IsBSwap ? TreeEntry::ReducedBitcastBSwap : TreeEntry::ReducedBitcast;
+          IsBSwap ? (ForLoads ? TreeEntry::ReducedBitcastBSwapLoads
+                              : TreeEntry::ReducedBitcastBSwap)
+                  : (ForLoads ? TreeEntry::ReducedBitcastLoads
+                              : TreeEntry::ReducedBitcast);
       E.CombinedOp = Code;
-      if (!IsBSwap)
-        E.ReorderIndices = std::move(Order);
+      E.ReorderIndices = std::move(Order);
       TreeEntry *ZExtEntry = getOperandEntry(&E, 0);
       assert(ZExtEntry->UserTreeIndex &&
              ZExtEntry->State == TreeEntry::Vectorize &&
@@ -13900,6 +13932,16 @@ void BoUpSLP::transformNodes() {
       // The ZExt node is part of the combined node.
       ZExtEntry->State = TreeEntry::CombinedVectorize;
       ZExtEntry->CombinedOp = Code;
+      if (ForLoads) {
+        TreeEntry *LoadsEntry = getOperandEntry(ZExtEntry, 0);
+        assert(LoadsEntry->UserTreeIndex &&
+               LoadsEntry->State == TreeEntry::Vectorize &&
+               LoadsEntry->getOpcode() == Instruction::Load &&
+               "Expected Load node.");
+        // The Load node is part of the combined node.
+        LoadsEntry->State = TreeEntry::CombinedVectorize;
+        LoadsEntry->CombinedOp = Code;
+      }
       TreeEntry *ConstEntry = getOperandEntry(&E, 1);
       assert(ConstEntry->UserTreeIndex && ConstEntry->isGather() &&
              "Expected ZExt node.");
@@ -15559,6 +15601,44 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     };
     return GetCostDiff(GetScalarCost, GetVectorCost);
   }
+  case TreeEntry::ReducedBitcastLoads:
+  case TreeEntry::ReducedBitcastBSwapLoads: {
+    auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
+      if (isa<PoisonValue>(UniqueValues[Idx]))
+        return InstructionCost(TTI::TCC_Free);
+      auto *Shl = dyn_cast<Instruction>(UniqueValues[Idx]);
+      if (!Shl)
+        return InstructionCost(TTI::TCC_Free);
+      InstructionCost ScalarCost = TTI.getInstructionCost(Shl, CostKind);
+      auto *ZExt = dyn_cast<Instruction>(Shl->getOperand(0));
+      if (!ZExt)
+        return ScalarCost;
+      ScalarCost += TTI.getInstructionCost(ZExt, CostKind);
+      auto *Load = dyn_cast<Instruction>(ZExt->getOperand(0));
+      if (!Load)
+        return ScalarCost;
+      ScalarCost += TTI.getInstructionCost(Load, CostKind);
+      return ScalarCost;
+    };
+    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
+      const TreeEntry *LhsTE = getOperandEntry(E, /*Idx=*/0);
+      const TreeEntry *LoadTE = getOperandEntry(LhsTE, /*Idx=*/0);
+      auto *LI0 = cast<LoadInst>(LoadTE->getMainOp());
+      auto *OrigScalarTy = E->getMainOp()->getType();
+      InstructionCost LoadCost =
+          TTI.getMemoryOpCost(Instruction::Load, OrigScalarTy, LI0->getAlign(),
+                              LI0->getPointerAddressSpace(), CostKind);
+      if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
+        IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, OrigScalarTy,
+                                          {OrigScalarTy});
+        InstructionCost IntrinsicCost =
+            TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
+        LoadCost += IntrinsicCost;
+      }
+      return LoadCost + CommonCost;
+    };
+    return GetCostDiff(GetScalarCost, GetVectorCost);
+  }
   case Instruction::FNeg:
   case Instruction::Add:
   case Instruction::FAdd:
@@ -16043,69 +16123,6 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
   return true;
 }
 
-static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
-                                       TargetTransformInfo *TTI,
-                                       bool MustMatchOrInst) {
-  // Look past the root to find a source value. Arbitrarily follow the
-  // path through operand 0 of any 'or'. Also, peek through optional
-  // shift-left-by-multiple-of-8-bits.
-  Value *ZextLoad = Root;
-  const APInt *ShAmtC;
-  bool FoundOr = false;
-  while (!isa<ConstantExpr>(ZextLoad) &&
-         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
-          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
-           ShAmtC->urem(8) == 0))) {
-    auto *BinOp = cast<BinaryOperator>(ZextLoad);
-    ZextLoad = BinOp->getOperand(0);
-    if (BinOp->getOpcode() == Instruction::Or)
-      FoundOr = true;
-  }
-  // Check if the input is an extended load of the required or/shift expression.
-  Value *Load;
-  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
-      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
-    return false;
-
-  // Require that the total load bit width is a legal integer type.
-  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
-  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
-  Type *SrcTy = Load->getType();
-  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
-  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
-    return false;
-
-  // Everything matched - assume that we can fold the whole sequence using
-  // load combining.
-  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
-                    << *(cast<Instruction>(Root)) << "\n");
-
-  return true;
-}
-
-bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
-  if (RdxKind != RecurKind::Or)
-    return false;
-
-  unsigned NumElts = VectorizableTree[0]->Scalars.size();
-  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
-  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
-                                    /* MatchOr */ false);
-}
-
-bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
-  // Peek through a final sequence of stores and check if all operations are
-  // likely to be load-combined.
-  unsigned NumElts = Stores.size();
-  for (Value *Scalar : Stores) {
-    Value *X;
-    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
-        !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
-      return false;
-  }
-  return true;
-}
-
 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
   if (!DebugCounter::shouldExecute(VectorizedGraphs))
     return true;
@@ -16334,7 +16351,9 @@ InstructionCost BoUpSLP::getSpillCost() {
   SmallPtrSet<const TreeEntry *, 8> ScalarOrPseudoEntries;
   for (const auto &TEPtr : VectorizableTree) {
     if (TEPtr->CombinedOp == TreeEntry::ReducedBitcast ||
-        TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwap) {
+        TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
+        TEPtr->CombinedOp == TreeEntry::ReducedBitcastLoads ||
+        TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads) {
       ScalarOrPseudoEntries.insert(TEPtr.get());
       continue;
     }
@@ -20269,6 +20288,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   switch (E->CombinedOp) {
   case TreeEntry::ReducedBitcast:
   case TreeEntry::ReducedBitcastBSwap:
+  case TreeEntry::ReducedBitcastLoads:
+  case TreeEntry::ReducedBitcastBSwapLoads:
     ShuffleOrOp = E->CombinedOp;
     break;
   default:
@@ -21236,6 +21257,31 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     ++NumVectorInstructions;
     return V;
   }
+  case TreeEntry::ReducedBitcastLoads:
+  case TreeEntry::ReducedBitcastBSwapLoads: {
+    assert(UserIgnoreList && "Expected reduction operations only.");
+    TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
+    TreeEntry *Load = getOperandEntry(ZExt, /*Idx=*/0);
+    setInsertPointAfterBundle(Load);
+    ZExt->VectorizedValue = PoisonValue::get(getWidenedType(
+        ZExt->getMainOp()->getType(), ZExt->getVectorFactor()));
+    TreeEntry *Const = getOperandEntry(E, /*Idx=*/1);
+    Const->VectorizedValue = PoisonValue::get(getWidenedType(
+        Const->Scalars.front()->getType(), Const->getVectorFactor()));
+    Load->VectorizedValue = PoisonValue::get(getWidenedType(
+        Load->getMainOp()->getType(), Load->getVectorFactor()));
+    LoadInst *LI = cast<LoadInst>(Load->getMainOp());
+    Value *PO = LI->getPointerOperand();
+    Type *ScalarTy = ZExt->getMainOp()->getType();
+    Value *V = Builder.CreateAlignedLoad(ScalarTy, PO, LI->getAlign());
+    ++NumVectorInstructions;
+    if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
+      V = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, V);
+      ++NumVectorInstructions;
+    }
+    E->VectorizedValue = V;
+    return V;
+  }
   default:
     llvm_unreachable("unknown inst");
   }
@@ -21260,10 +21306,15 @@ Value *BoUpSLP::vectorizeTree(
   // Cache last instructions for the nodes to avoid side effects, which may
   // appear during vectorization, like extra uses, etc.
   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+    // Need to generate insertion point for loads nodes of the bitcast/bswap
+    // ops.
     if (TE->isGather() || DeletedNodes.contains(TE.get()) ||
         (TE->State == TreeEntry::CombinedVectorize &&
          (TE->CombinedOp == TreeEntry::ReducedBitcast ||
-          TE->CombinedOp == TreeEntry::ReducedBitcastBSwap)))
+          TE->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
+          ((TE->CombinedOp == TreeEntry::ReducedBitcastLoads ||
+            TE->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads) &&
+           (!TE->hasState() || TE->getOpcode() != Instruction::Load)))))
       continue;
     (void)getLastInstructionInBundle(TE.get());
   }
@@ -21822,7 +21873,9 @@ Value *BoUpSLP::vectorizeTree(
       continue;
     if (Entry->CombinedOp == TreeEntry::ReducedBitcast ||
-        Entry->CombinedOp == TreeEntry::ReducedBitcastBSwap) {
+        Entry->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
+        Entry->CombinedOp == TreeEntry::ReducedBitcastLoads ||
+        Entry->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads) {
       // Skip constant node
       if (!Entry->hasState()) {
         assert(allConstant(Entry->Scalars) && "Expected constants only.");
@@ -24138,8 +24191,6 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
       return false;
     }
   }
-  if (R.isLoadCombineCandidate(Chain))
-    return true;
   R.buildTree(Chain);
   // Check if tree tiny and store itself or its value is not vectorized.
   if (R.isTreeTinyAndNotFullyVectorizable()) {
@@ -25744,11 +25795,6 @@ public:
         V.analyzedReductionVals(VL);
         continue;
       }
-      if (V.isLoadCombineReductionCandidate(RdxKind)) {
-        if (!AdjustReducedVals())
-          V.analyzedReductionVals(VL);
-        continue;
-      }
       V.reorderTopToBottom();
       // No need to reorder the root node at all for reassociative reduction.
       V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
