diff options
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 466 |
1 files changed, 268 insertions, 198 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 3a6b3dd28b73..f631ec963d49 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -17527,34 +17527,50 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { return true; } - if (VectorizableTree.size() == 1 && !ForReduction && - VectorizableTree.front()->isGather() && - VectorizableTree.front()->hasState() && - VectorizableTree.front()->getOpcode() == Instruction::ExtractElement) - return true; - if (VectorizableTree.size() == 1 && !ForReduction && - VectorizableTree.front()->isGather() && - any_of(VectorizableTree.front()->Scalars, IsaPred<Instruction>)) - return true; - if (VectorizableTree.size() <= MinTreeSize && !ForReduction && - all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { - return TE->isGather() || TE->State == TreeEntry::SplitVectorize; - })) - return true; - if (VectorizableTree.size() == 1 && !ForReduction && SLPCostThreshold < 0 && - VectorizableTree.front()->hasState() && - VectorizableTree.front()->getOpcode() == Instruction::ExtractElement && - (VectorizableTree.front()->getVectorFactor() == 2 || - all_of( - VectorizableTree.front()->Scalars, - [&](Value *V) { - auto *I = dyn_cast<Instruction>(V); - return !I || !areAllUsersVectorized(I, UserIgnoreList); - }))) - return true; + // Cache values from the root node and the cost-threshold options to avoid + // re-querying them inside hot predicates below. + const unsigned TreeSize = VectorizableTree.size(); + const TreeEntry &Front = *VectorizableTree.front(); + const bool FrontIsGather = Front.isGather(); + const bool FrontHasState = Front.hasState(); + const unsigned FrontOpcode = FrontHasState ? Front.getOpcode() : 0u; + const bool ThresholdSet = SLPCostThreshold.getNumOccurrences() > 0; + const bool ThresholdNonNegative = SLPCostThreshold >= 0; + + constexpr int Limit = 4; + constexpr unsigned LargeTree = 20; + constexpr int LimitTreeSize = 36; + + // The remaining size-1/size-<=MinTreeSize early bail-outs only apply to + // non-reduction trees; group them under a single guard to avoid 3 separate + // !ForReduction short-circuits when reducing. + if (!ForReduction) { + // Single gather node: bail out for ExtractElement or any node containing a + // real Instruction scalar. + if (TreeSize == 1 && FrontIsGather) { + if (FrontHasState && FrontOpcode == Instruction::ExtractElement) + return true; + if (any_of(Front.Scalars, IsaPred<Instruction>)) + return true; + } + if (TreeSize <= MinTreeSize && + all_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) { + return TE->isGather() || TE->State == TreeEntry::SplitVectorize; + })) + return true; + if (TreeSize == 1 && SLPCostThreshold < 0 && FrontHasState && + FrontOpcode == Instruction::ExtractElement && + (Front.getVectorFactor() == 2 || + all_of( + Front.Scalars, + [&](Value *V) { + auto *I = dyn_cast<Instruction>(V); + return !I || !areAllUsersVectorized(I, UserIgnoreList); + }))) + return true; + } // No need to vectorize inserts of gathered values. - if (VectorizableTree.size() == 2 && - isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) && + if (TreeSize == 2 && isa<InsertElementInst>(Front.Scalars[0]) && VectorizableTree[1]->isGather() && (VectorizableTree[1]->getVectorFactor() <= 2 || !(isSplat(VectorizableTree[1]->Scalars) || @@ -17563,11 +17579,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { // The tree with only 3 nodes, where 2 last are gathers/buildvectors, not // profitable for vectorization. - constexpr int Limit = 4; - if (VectorizableTree.size() == 3 && SLPCostThreshold == 0 && - (!ForReduction || VectorizableTree.front()->getVectorFactor() <= 2) && + if (TreeSize == 3 && SLPCostThreshold == 0 && + (!ForReduction || Front.getVectorFactor() <= 2) && all_of(ArrayRef(VectorizableTree).drop_front(), - [&](const std::unique_ptr<TreeEntry> &TE) { + [](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather() && TE->getVectorFactor() <= Limit && !all_of( TE->Scalars, @@ -17575,161 +17590,218 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { })) return true; - // If the graph includes only PHI nodes and gathers, it is defnitely not - // profitable for the vectorization, we can skip it, if the cost threshold is - // default. The cost of vectorized PHI nodes is almost always 0 + the cost of - // gathers/buildvectors. - if (!ForReduction && !SLPCostThreshold.getNumOccurrences() && - !VectorizableTree.empty() && - all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { - return (TE->isGather() && - (!TE->hasState() || - TE->getOpcode() != Instruction::ExtractElement) && - count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) || - (TE->hasState() && TE->getOpcode() == Instruction::PHI); - })) - return true; + // All remaining bail-out heuristics require !ForReduction. Group them under + // a single guard so reduction trees skip them with one branch instead of one + // per check. + if (!ForReduction) { + // If the graph includes only PHI nodes and gathers, it is defnitely not + // profitable for the vectorization, we can skip it, if the cost threshold + // is default. The cost of vectorized PHI nodes is almost always 0 + the + // cost of gathers/buildvectors. + if (!ThresholdSet && + all_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) { + const bool IsGather = TE->isGather(); + const bool HasState = TE->hasState(); + const unsigned Op = HasState ? TE->getOpcode() : 0u; + if (IsGather && (!HasState || Op != Instruction::ExtractElement) && + count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) + return true; + return HasState && Op == Instruction::PHI; + })) + return true; - // Do not vectorize small tree of phis only, if all vector phis are also - // gathered. - if (!ForReduction && SLPCostThreshold.getNumOccurrences() && - VectorizableTree.size() <= Limit && - all_of(VectorizableTree, - [&](const std::unique_ptr<TreeEntry> &TE) { - return (TE->isGather() && - (!TE->hasState() || - TE->getOpcode() != Instruction::ExtractElement) && - count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= - Limit) || - (TE->hasState() && - (TE->getOpcode() == Instruction::InsertElement || - (TE->getOpcode() == Instruction::PHI && - all_of(TE->Scalars, [&](Value *V) { - return isa<PoisonValue>(V) || MustGather.contains(V); - })))); - }) && - any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { - return TE->State == TreeEntry::Vectorize && - TE->getOpcode() == Instruction::PHI; - })) - return true; + // Do not vectorize small tree of phis only, if all vector phis are also + // gathered. + if (ThresholdSet && TreeSize <= Limit) { + bool HasVectorPhi = false; + auto Compatible = [&](const std::unique_ptr<TreeEntry> &TE) { + const bool IsGather = TE->isGather(); + const bool HasState = TE->hasState(); + const unsigned Op = HasState ? TE->getOpcode() : 0u; + if (IsGather && (!HasState || Op != Instruction::ExtractElement) && + count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) + return true; + if (!HasState) + return false; + if (Op == Instruction::InsertElement) + return true; + if (Op != Instruction::PHI) + return false; + if (TE->State == TreeEntry::Vectorize) + HasVectorPhi = true; + return all_of(TE->Scalars, [&](Value *V) { + return isa<PoisonValue>(V) || MustGather.contains(V); + }); + }; + if (all_of(VectorizableTree, Compatible) && HasVectorPhi) + return true; + } - // PHI nodes only and gathers cannot be vectorized, skip. - constexpr unsigned LargeTree = 20; - bool HasSingleLoad = false; - if (!ForReduction && SLPCostThreshold >= 0 && - all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { - bool PrevLoad = HasSingleLoad; - HasSingleLoad |= - TE->hasState() && !TE->isGather() && - (TE->getOpcode() == Instruction::Load || - TE->hasCopyableElements()) && - (TE->getVectorFactor() > 2 || TE->ReorderIndices.empty()); - return (TE->hasState() && - (TE->getOpcode() == Instruction::PHI || - (VectorizableTree.size() >= LargeTree && - (TE->getOpcode() == Instruction::Store || - (TE->getOpcode() == Instruction::Load && !PrevLoad)) && - TE->getVectorFactor() <= Limit))) || - (TE->isGather() && - (!TE->hasState() || - TE->getOpcode() != Instruction::ExtractElement)); - })) - return true; + // PHI nodes only and gathers cannot be vectorized, skip. + if (ThresholdNonNegative) { + const bool IsLargeTree = TreeSize >= LargeTree; + bool HasSingleLoad = false; + if (all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { + const bool IsGather = TE->isGather(); + const bool HasState = TE->hasState(); + const unsigned Op = HasState ? TE->getOpcode() : 0u; + // HasSingleLoad/PrevLoad are only consulted in the + // IsLargeTree branch; skip the bookkeeping otherwise. + if (IsLargeTree) { + const bool PrevLoad = HasSingleLoad; + HasSingleLoad |= + HasState && !IsGather && + (Op == Instruction::Load || TE->hasCopyableElements()) && + (TE->getVectorFactor() > 2 || TE->ReorderIndices.empty()); + if (HasState) { + if (Op == Instruction::PHI) + return true; + if (TE->getVectorFactor() <= Limit && + (Op == Instruction::Store || + (Op == Instruction::Load && !PrevLoad))) + return true; + } + } else if (HasState && Op == Instruction::PHI) { + return true; + } + return IsGather && (!HasState || Op != Instruction::ExtractElement); + })) + return true; - // Single non-phi vector node - skip the tree. - bool VectorNodeFound = false; - bool AnyNonConst = false; - if (!ForReduction && SLPCostThreshold >= 0 && VectorizableTree.size() >= 5 && - VectorizableTree.front()->getVectorFactor() <= 2 && - VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() && - all_of(VectorizableTree, - [&](const std::unique_ptr<TreeEntry> &TE) { - if (TE->State == TreeEntry::Vectorize && TE->hasState()) { - if (TE->hasState() && (TE->getOpcode() == Instruction::PHI || - !TE->ReorderIndices.empty())) - return true; - bool PrevVectorNodeFound = VectorNodeFound; - VectorNodeFound = true; - return !PrevVectorNodeFound; - } - AnyNonConst |= !allConstant(TE->Scalars); - return TE->isGather() || TE->State == TreeEntry::SplitVectorize; - }) && - AnyNonConst) - return true; + // Single non-phi vector node - skip the tree. + if (TreeSize >= 5 && Front.getVectorFactor() <= 2 && + Front.Scalars.front()->getType()->isIntegerTy()) { + bool VectorNodeFound = false; + bool AnyNonConst = false; + if (all_of(VectorizableTree, + [&](const std::unique_ptr<TreeEntry> &TE) { + if (TE->State == TreeEntry::Vectorize && TE->hasState()) { + const unsigned Op = TE->getOpcode(); + if (Op == Instruction::PHI || + !TE->ReorderIndices.empty()) + return true; + if (VectorNodeFound) + return false; + VectorNodeFound = true; + return true; + } + // Once AnyNonConst is true, skip the O(n) allConstant + // walk for subsequent entries. + if (!AnyNonConst) + AnyNonConst = !allConstant(TE->Scalars); + return TE->isGather() || + TE->State == TreeEntry::SplitVectorize; + }) && + AnyNonConst) + return true; + } + } - // If the tree contains only phis, buildvectors, split nodes and - // small nodes with reuses, we can skip it. - SmallVector<const TreeEntry *> StoreLoadNodes; - unsigned NumGathers = 0; - constexpr int LimitTreeSize = 36; - if (!ForReduction && !SLPCostThreshold.getNumOccurrences() && - all_of(VectorizableTree, - [&](const std::unique_ptr<TreeEntry> &TE) { - if (!TE->isGather() && TE->hasState() && - (TE->getOpcode() == Instruction::Load || - TE->getOpcode() == Instruction::Store)) { - StoreLoadNodes.push_back(TE.get()); - return true; - } - if (TE->isGather()) - ++NumGathers; - return TE->State == TreeEntry::SplitVectorize || - (TE->Idx == 0 && TE->Scalars.size() == 2 && - TE->hasState() && TE->getOpcode() == Instruction::ICmp && - VectorizableTree.size() > LimitTreeSize) || - (TE->isGather() && - none_of(TE->Scalars, IsaPred<ExtractElementInst>)) || - (TE->hasState() && - (TE->getOpcode() == Instruction::PHI || - (TE->hasCopyableElements() && - static_cast<unsigned>(count_if( - TE->Scalars, IsaPred<PHINode, Constant>)) >= - TE->Scalars.size() / 2) || - ((!TE->ReuseShuffleIndices.empty() || - !TE->ReorderIndices.empty() || TE->isAltShuffle()) && - TE->Scalars.size() == 2))); - }) && - (StoreLoadNodes.empty() || - (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() && - (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) { - return TE->getOpcode() == Instruction::Store || - all_of(TE->Scalars, [&](Value *V) { - return !isa<LoadInst>(V) || - areAllUsersVectorized(cast<Instruction>(V)); - }); - }))))) - return true; + // Common predicate for "phis, buildvectors, split nodes and small nodes + // with reuses" used by the two checks below. Cheap checks are evaluated + // before expensive Scalars walks. + auto IsBenignNode = [&](const TreeEntry &TE) { + if (TE.State == TreeEntry::SplitVectorize) + return true; + const bool IsGather = TE.isGather(); + const bool HasState = TE.hasState(); + if (HasState) { + const unsigned Op = TE.getOpcode(); + if (Op == Instruction::PHI) + return true; + const unsigned ScalarsSize = TE.Scalars.size(); + if (TE.Idx == 0 && ScalarsSize == 2 && Op == Instruction::ICmp && + TreeSize > LimitTreeSize) + return true; + if (ScalarsSize == 2 && + (!TE.ReuseShuffleIndices.empty() || !TE.ReorderIndices.empty() || + TE.isAltShuffle())) + return true; + if (TE.hasCopyableElements() && + static_cast<unsigned>(count_if( + TE.Scalars, IsaPred<PHINode, Constant>)) >= ScalarsSize / 2) + return true; + } + return IsGather && none_of(TE.Scalars, IsaPred<ExtractElementInst>); + }; - // If the tree contains only buildvector, 2 non-buildvectors (with root user - // tree node) and other buildvectors, we can skip it. - if (!ForReduction && SLPCostThreshold.getNumOccurrences() && - VectorizableTree.front()->State == TreeEntry::SplitVectorize && - VectorizableTree.size() >= Limit && - count_if(ArrayRef(VectorizableTree).drop_front(), - [&](const std::unique_ptr<TreeEntry> &TE) { - return !TE->isGather() && TE->UserTreeIndex.UserTE && - TE->UserTreeIndex.UserTE->Idx == 0; - }) == 2) - return true; + // If the tree contains only phis, buildvectors, split nodes and + // small nodes with reuses, we can skip it. + if (!ThresholdSet) { + SmallVector<const TreeEntry *> StoreLoadNodes; + unsigned NumGathers = 0; + if (all_of(VectorizableTree, + [&](const std::unique_ptr<TreeEntry> &TE) { + const bool IsGather = TE->isGather(); + if (!IsGather && TE->hasState()) { + const unsigned Op = TE->getOpcode(); + if (Op == Instruction::Load || Op == Instruction::Store) { + StoreLoadNodes.push_back(TE.get()); + return true; + } + } + if (IsGather) + ++NumGathers; + return IsBenignNode(*TE); + }) && + (StoreLoadNodes.empty() || + (TreeSize > LimitTreeSize * StoreLoadNodes.size() && + (NumGathers > 0 || + none_of(StoreLoadNodes, [&](const TreeEntry *TE) { + return TE->getOpcode() == Instruction::Store || + all_of(TE->Scalars, [&](Value *V) { + return !isa<LoadInst>(V) || + areAllUsersVectorized(cast<Instruction>(V)); + }); + }))))) + return true; + } - // If the tree contains only vectorization of the phi node from the - // buildvector - skip it. - if (!ForReduction && SLPCostThreshold.getNumOccurrences() && - VectorizableTree.size() > 2 && - VectorizableTree.front()->State == TreeEntry::Vectorize && - VectorizableTree.front()->getOpcode() == Instruction::InsertElement && - VectorizableTree[1]->State == TreeEntry::Vectorize && - VectorizableTree[1]->getOpcode() == Instruction::PHI && - all_of( - ArrayRef(VectorizableTree).drop_front(2), - [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); })) - return true; + // If the tree contains only phis, buildvectors, split nodes and + // small nodes with reuses, we can skip it. + if (ThresholdNonNegative && TreeSize > LimitTreeSize) { + const TreeEntry *VectorNode = nullptr; + if (all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { + if (!TE->isGather() && TE->hasState() && + TE->State != TreeEntry::SplitVectorize && + TE->getOpcode() != Instruction::PHI) { + if (VectorNode) + return false; + VectorNode = TE.get(); + return true; + } + return IsBenignNode(*TE); + })) + return true; + } + + // If the tree contains only buildvector, 2 non-buildvectors (with root + // user tree node) and other buildvectors, we can skip it. + if (ThresholdSet && TreeSize >= Limit && + Front.State == TreeEntry::SplitVectorize && + count_if(ArrayRef(VectorizableTree).drop_front(), + [](const std::unique_ptr<TreeEntry> &TE) { + return !TE->isGather() && TE->UserTreeIndex.UserTE && + TE->UserTreeIndex.UserTE->Idx == 0; + }) == 2) + return true; + + // If the tree contains only vectorization of the phi node from the + // buildvector - skip it. + if (ThresholdSet && TreeSize > 2 && Front.State == TreeEntry::Vectorize && + FrontOpcode == Instruction::InsertElement && + VectorizableTree[1]->State == TreeEntry::Vectorize && + VectorizableTree[1]->getOpcode() == Instruction::PHI && + all_of(ArrayRef(VectorizableTree).drop_front(2), + [](const std::unique_ptr<TreeEntry> &TE) { + return TE->isGather(); + })) + return true; + } // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. - if (VectorizableTree.size() >= MinTreeSize) + if (TreeSize >= MinTreeSize) return false; // If we have a tiny tree (a tree whose size is less than MinTreeSize), we @@ -17738,14 +17810,13 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { return false; // Check if any of the gather node forms an insertelement buildvector - // somewhere. - bool IsAllowedSingleBVNode = - VectorizableTree.size() > 1 || - (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() && - !VectorizableTree.front()->isAltShuffle() && - VectorizableTree.front()->getOpcode() != Instruction::PHI && - VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr && - allSameBlock(VectorizableTree.front()->Scalars)); + // somewhere. TreeSize >= 1 is guaranteed, so the multi-node case reduces to + // a simple TreeSize > 1 short-circuit. + const bool IsAllowedSingleBVNode = + TreeSize > 1 || (FrontHasState && !Front.isAltShuffle() && + FrontOpcode != Instruction::PHI && + FrontOpcode != Instruction::GetElementPtr && + allSameBlock(Front.Scalars)); if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather() && all_of(TE->Scalars, [&](Value *V) { return isa<ExtractElementInst, Constant>(V) || @@ -17756,19 +17827,18 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { })) return false; - if (VectorizableTree.back()->isGather() && - VectorizableTree.back()->hasState() && - VectorizableTree.back()->isAltShuffle() && - VectorizableTree.back()->getVectorFactor() > 2 && - allSameBlock(VectorizableTree.back()->Scalars) && - !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() && - TTI->getScalarizationOverhead( - getWidenedType(VectorizableTree.back()->Scalars.front()->getType(), - VectorizableTree.back()->getVectorFactor()), - APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()), - /*Insert=*/true, /*Extract=*/false, - TTI::TCK_RecipThroughput) > -SLPCostThreshold) - return false; + const TreeEntry &Back = *VectorizableTree.back(); + if (Back.isGather() && Back.hasState() && Back.isAltShuffle()) { + const unsigned BackVF = Back.getVectorFactor(); + if (BackVF > 2 && allSameBlock(Back.Scalars) && + !Back.Scalars.front()->getType()->isVectorTy() && + TTI->getScalarizationOverhead( + getWidenedType(Back.Scalars.front()->getType(), BackVF), + APInt::getAllOnes(BackVF), + /*Insert=*/true, /*Extract=*/false, + TTI::TCK_RecipThroughput) > -SLPCostThreshold) + return false; + } // Otherwise, we can't vectorize the tree. It is both tiny and not fully // vectorizable. |
