diff options
Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r-- | llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp | 31 | ||||
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp | 25 | ||||
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 23 | ||||
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 3 | ||||
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 6 | ||||
-rw-r--r-- | llvm/lib/Transforms/Scalar/ConstraintElimination.cpp | 20 | ||||
-rw-r--r-- | llvm/lib/Transforms/Scalar/MergeICmps.cpp | 34 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.h | 13 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 67 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 80 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 2 |
11 files changed, 206 insertions, 98 deletions
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index a0f7ec6..2dd0fde 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -948,17 +948,17 @@ void llvm::updateVCallVisibilityInIndex( // linker, as we have no information on their eventual use. if (DynamicExportSymbols.count(P.first)) continue; + // With validation enabled, we want to exclude symbols visible to regular + // objects. Local symbols will be in this group due to the current + // implementation but those with VCallVisibilityTranslationUnit will have + // already been marked in clang so are unaffected. + if (VisibleToRegularObjSymbols.count(P.first)) + continue; for (auto &S : P.second.getSummaryList()) { auto *GVar = dyn_cast<GlobalVarSummary>(S.get()); if (!GVar || GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic) continue; - // With validation enabled, we want to exclude symbols visible to regular - // objects. Local symbols will be in this group due to the current - // implementation but those with VCallVisibilityTranslationUnit will have - // already been marked in clang so are unaffected. - if (VisibleToRegularObjSymbols.count(P.first)) - continue; GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit); } } @@ -1161,14 +1161,10 @@ bool DevirtIndex::tryFindVirtualCallTargets( // and therefore the same GUID. This can happen if there isn't enough // distinguishing path when compiling the source file. In that case we // conservatively return false early. + if (P.VTableVI.hasLocal() && P.VTableVI.getSummaryList().size() > 1) + return false; const GlobalVarSummary *VS = nullptr; - bool LocalFound = false; for (const auto &S : P.VTableVI.getSummaryList()) { - if (GlobalValue::isLocalLinkage(S->linkage())) { - if (LocalFound) - return false; - LocalFound = true; - } auto *CurVS = cast<GlobalVarSummary>(S->getBaseObject()); if (!CurVS->vTableFuncs().empty() || // Previously clang did not attach the necessary type metadata to @@ -1184,6 +1180,7 @@ bool DevirtIndex::tryFindVirtualCallTargets( // with public LTO visibility. if (VS->getVCallVisibility() == GlobalObject::VCallVisibilityPublic) return false; + break; } } // There will be no VS if all copies are available_externally having no @@ -1411,9 +1408,8 @@ bool DevirtIndex::trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot, // If the summary list contains multiple summaries where at least one is // a local, give up, as we won't know which (possibly promoted) name to use. - for (const auto &S : TheFn.getSummaryList()) - if (GlobalValue::isLocalLinkage(S->linkage()) && Size > 1) - return false; + if (TheFn.hasLocal() && Size > 1) + return false; // Collect functions devirtualized at least for one call site for stats. if (PrintSummaryDevirt || AreStatisticsEnabled()) @@ -2591,6 +2587,11 @@ void DevirtIndex::run() { if (ExportSummary.typeIdCompatibleVtableMap().empty()) return; + // Assert that we haven't made any changes that would affect the hasLocal() + // flag on the GUID summary info. + assert(!ExportSummary.withInternalizeAndPromote() && + "Expect index-based WPD to run before internalization and promotion"); + DenseMap<GlobalValue::GUID, std::vector<StringRef>> NameByGUID; for (const auto &P : ExportSummary.typeIdCompatibleVtableMap()) { NameByGUID[GlobalValue::getGUIDAssumingExternalLinkage(P.first)].push_back( diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 73ec451..9bee523 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -2760,21 +2760,34 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { // Optimize pointer differences into the same array into a size. Consider: // &A[10] - &A[0]: we should compile this to "10". Value *LHSOp, *RHSOp; - if (match(Op0, m_PtrToInt(m_Value(LHSOp))) && - match(Op1, m_PtrToInt(m_Value(RHSOp)))) + if (match(Op0, m_PtrToIntOrAddr(m_Value(LHSOp))) && + match(Op1, m_PtrToIntOrAddr(m_Value(RHSOp)))) if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(), I.hasNoUnsignedWrap())) return replaceInstUsesWith(I, Res); // trunc(p)-trunc(q) -> trunc(p-q) - if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) && - match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp))))) + if (match(Op0, m_Trunc(m_PtrToIntOrAddr(m_Value(LHSOp)))) && + match(Op1, m_Trunc(m_PtrToIntOrAddr(m_Value(RHSOp))))) if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(), /* IsNUW */ false)) return replaceInstUsesWith(I, Res); - if (match(Op0, m_ZExt(m_PtrToIntSameSize(DL, m_Value(LHSOp)))) && - match(Op1, m_ZExtOrSelf(m_PtrToInt(m_Value(RHSOp))))) { + auto MatchSubOfZExtOfPtrToIntOrAddr = [&]() { + if (match(Op0, m_ZExt(m_PtrToIntSameSize(DL, m_Value(LHSOp)))) && + match(Op1, m_ZExt(m_PtrToIntSameSize(DL, m_Value(RHSOp))))) + return true; + if (match(Op0, m_ZExt(m_PtrToAddr(m_Value(LHSOp)))) && + match(Op1, m_ZExt(m_PtrToAddr(m_Value(RHSOp))))) + return true; + // Special case for non-canonical ptrtoint in constant expression, + // where the zext has been folded into the ptrtoint. + if (match(Op0, m_ZExt(m_PtrToIntSameSize(DL, m_Value(LHSOp)))) && + match(Op1, m_PtrToInt(m_Value(RHSOp)))) + return true; + return false; + }; + if (MatchSubOfZExtOfPtrToIntOrAddr()) { if (auto *GEP = dyn_cast<GEPOperator>(LHSOp)) { if (GEP->getPointerOperand() == RHSOp) { if (GEP->hasNoUnsignedWrap() || GEP->hasNoUnsignedSignedWrap()) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index dab200d..669d4f0 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4003,18 +4003,29 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // Try to fold intrinsic into select/phi operands. This is legal if: // * The intrinsic is speculatable. - // * The select condition is not a vector, or the intrinsic does not - // perform cross-lane operations. - if (isSafeToSpeculativelyExecuteWithVariableReplaced(&CI) && - isNotCrossLaneOperation(II)) + // * The operand is one of the following: + // - a phi. + // - a select with a scalar condition. + // - a select with a vector condition and II is not a cross lane operation. + if (isSafeToSpeculativelyExecuteWithVariableReplaced(&CI)) { for (Value *Op : II->args()) { - if (auto *Sel = dyn_cast<SelectInst>(Op)) - if (Instruction *R = FoldOpIntoSelect(*II, Sel)) + if (auto *Sel = dyn_cast<SelectInst>(Op)) { + bool IsVectorCond = Sel->getCondition()->getType()->isVectorTy(); + if (IsVectorCond && !isNotCrossLaneOperation(II)) + continue; + // Don't replace a scalar select with a more expensive vector select if + // we can't simplify both arms of the select. + bool SimplifyBothArms = + !Op->getType()->isVectorTy() && II->getType()->isVectorTy(); + if (Instruction *R = FoldOpIntoSelect( + *II, Sel, /*FoldWithMultiUse=*/false, SimplifyBothArms)) return R; + } if (auto *Phi = dyn_cast<PHINode>(Op)) if (Instruction *R = foldOpIntoPhi(*II, Phi)) return R; } + } if (Instruction *Shuf = foldShuffledIntrinsicOperands(II)) return Shuf; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 943c223..ede73f8 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -664,7 +664,8 @@ public: /// This also works for Cast instructions, which obviously do not have a /// second operand. Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI, - bool FoldWithMultiUse = false); + bool FoldWithMultiUse = false, + bool SimplifyBothArms = false); /// This is a convenience wrapper function for the above two functions. Instruction *foldBinOpIntoSelectOrPhi(BinaryOperator &I); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 3f11cae..67e2aae 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1777,7 +1777,8 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, SelectInst *SI, } Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI, - bool FoldWithMultiUse) { + bool FoldWithMultiUse, + bool SimplifyBothArms) { // Don't modify shared select instructions unless set FoldWithMultiUse if (!SI->hasOneUse() && !FoldWithMultiUse) return nullptr; @@ -1821,6 +1822,9 @@ Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI, if (!NewTV && !NewFV) return nullptr; + if (SimplifyBothArms && !(NewTV && NewFV)) + return nullptr; + // Create an instruction for the arm that did not fold. if (!NewTV) NewTV = foldOperationIntoSelectOperand(Op, SI, TV, *this); diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 4acc3f2..d347ced 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -614,6 +614,16 @@ static Decomposition decompose(Value *V, return {V, IsKnownNonNegative}; } + if (match(V, m_Add(m_Value(Op0), m_ConstantInt(CI))) && CI->isNegative() && + canUseSExt(CI)) { + Preconditions.emplace_back( + CmpInst::ICMP_UGE, Op0, + ConstantInt::get(Op0->getType(), CI->getSExtValue() * -1)); + if (auto Decomp = MergeResults(Op0, CI, true)) + return *Decomp; + return {V, IsKnownNonNegative}; + } + if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1)))) { if (!isKnownNonNegative(Op0, DL)) Preconditions.emplace_back(CmpInst::ICMP_SGE, Op0, @@ -627,16 +637,6 @@ static Decomposition decompose(Value *V, return {V, IsKnownNonNegative}; } - if (match(V, m_Add(m_Value(Op0), m_ConstantInt(CI))) && CI->isNegative() && - canUseSExt(CI)) { - Preconditions.emplace_back( - CmpInst::ICMP_UGE, Op0, - ConstantInt::get(Op0->getType(), CI->getSExtValue() * -1)); - if (auto Decomp = MergeResults(Op0, CI, true)) - return *Decomp; - return {V, IsKnownNonNegative}; - } - // Decompose or as an add if there are no common bits between the operands. if (match(V, m_DisjointOr(m_Value(Op0), m_ConstantInt(CI)))) { if (auto Decomp = MergeResults(Op0, CI, IsSigned)) diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp index a83cbd17a7..f273e9d 100644 --- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp +++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp @@ -64,10 +64,10 @@ using namespace llvm; -namespace { - #define DEBUG_TYPE "mergeicmps" +namespace { + // A BCE atom "Binary Compare Expression Atom" represents an integer load // that is a constant offset from a base value, e.g. `a` or `o.c` in the example // at the top. @@ -128,11 +128,12 @@ private: unsigned Order = 1; DenseMap<const Value*, int> BaseToIndex; }; +} // namespace // If this value is a load from a constant offset w.r.t. a base address, and // there are no other users of the load or address, returns the base address and // the offset. -BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) { +static BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) { auto *const LoadI = dyn_cast<LoadInst>(Val); if (!LoadI) return {}; @@ -175,6 +176,7 @@ BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) { return BCEAtom(GEP, LoadI, BaseId.getBaseId(Base), Offset); } +namespace { // A comparison between two BCE atoms, e.g. `a == o.a` in the example at the // top. // Note: the terminology is misleading: the comparison is symmetric, so there @@ -239,6 +241,7 @@ class BCECmpBlock { private: BCECmp Cmp; }; +} // namespace bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst, AliasAnalysis &AA) const { @@ -302,9 +305,9 @@ bool BCECmpBlock::doesOtherWork() const { // Visit the given comparison. If this is a comparison between two valid // BCE atoms, returns the comparison. -std::optional<BCECmp> visitICmp(const ICmpInst *const CmpI, - const ICmpInst::Predicate ExpectedPredicate, - BaseIdentifier &BaseId) { +static std::optional<BCECmp> +visitICmp(const ICmpInst *const CmpI, + const ICmpInst::Predicate ExpectedPredicate, BaseIdentifier &BaseId) { // The comparison can only be used once: // - For intermediate blocks, as a branch condition. // - For the final block, as an incoming value for the Phi. @@ -332,10 +335,9 @@ std::optional<BCECmp> visitICmp(const ICmpInst *const CmpI, // Visit the given comparison block. If this is a comparison between two valid // BCE atoms, returns the comparison. -std::optional<BCECmpBlock> visitCmpBlock(Value *const Val, - BasicBlock *const Block, - const BasicBlock *const PhiBlock, - BaseIdentifier &BaseId) { +static std::optional<BCECmpBlock> +visitCmpBlock(Value *const Val, BasicBlock *const Block, + const BasicBlock *const PhiBlock, BaseIdentifier &BaseId) { if (Block->empty()) return std::nullopt; auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator()); @@ -397,6 +399,7 @@ static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons, Comparisons.push_back(std::move(Comparison)); } +namespace { // A chain of comparisons. class BCECmpChain { public: @@ -420,6 +423,7 @@ private: // The original entry block (before sorting); BasicBlock *EntryBlock_; }; +} // namespace static bool areContiguous(const BCECmpBlock &First, const BCECmpBlock &Second) { return First.Lhs().BaseId == Second.Lhs().BaseId && @@ -742,9 +746,8 @@ bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA, return true; } -std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi, - BasicBlock *const LastBlock, - int NumBlocks) { +static std::vector<BasicBlock *> +getOrderedBlocks(PHINode &Phi, BasicBlock *const LastBlock, int NumBlocks) { // Walk up from the last block to find other blocks. std::vector<BasicBlock *> Blocks(NumBlocks); assert(LastBlock && "invalid last block"); @@ -777,8 +780,8 @@ std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi, return Blocks; } -bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, AliasAnalysis &AA, - DomTreeUpdater &DTU) { +static bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, + AliasAnalysis &AA, DomTreeUpdater &DTU) { LLVM_DEBUG(dbgs() << "processPhi()\n"); if (Phi.getNumIncomingValues() <= 1) { LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n"); @@ -874,6 +877,7 @@ static bool runImpl(Function &F, const TargetLibraryInfo &TLI, return MadeChange; } +namespace { class MergeICmpsLegacyPass : public FunctionPass { public: static char ID; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 06bea2f..a1ad2db 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2712,7 +2712,8 @@ public: static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPReductionSC || - R->getVPDefID() == VPRecipeBase::VPReductionEVLSC; + R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || + R->getVPDefID() == VPRecipeBase::VPPartialReductionSC; } static inline bool classof(const VPUser *U) { @@ -2783,7 +2784,10 @@ public: Opcode(Opcode), VFScaleFactor(ScaleFactor) { [[maybe_unused]] auto *AccumulatorRecipe = getChainOp()->getDefiningRecipe(); - assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) || + // When cloning as part of a VPExpressionRecipe the chain op could have + // replaced by a temporary VPValue, so it doesn't have a defining recipe. + assert((!AccumulatorRecipe || + isa<VPReductionPHIRecipe>(AccumulatorRecipe) || isa<VPPartialReductionRecipe>(AccumulatorRecipe)) && "Unexpected operand order for partial reduction recipe"); } @@ -3093,6 +3097,11 @@ public: /// removed before codegen. void decompose(); + unsigned getVFScaleFactor() const { + auto *PR = dyn_cast<VPPartialReductionRecipe>(ExpressionRecipes.back()); + return PR ? PR->getVFScaleFactor() : 1; + } + /// Method for generating code, must not be called as this recipe is abstract. void execute(VPTransformState &State) override { llvm_unreachable("recipe must be removed before execute"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1f1b42b..931a5b7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -168,6 +168,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects(); case VPBlendSC: case VPReductionEVLSC: + case VPPartialReductionSC: case VPReductionSC: case VPScalarIVStepsSC: case VPVectorPointerSC: @@ -300,14 +301,23 @@ InstructionCost VPPartialReductionRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { std::optional<unsigned> Opcode; - VPValue *Op = getOperand(0); - VPRecipeBase *OpR = Op->getDefiningRecipe(); - - // If the partial reduction is predicated, a select will be operand 0 - if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) { - OpR = Op->getDefiningRecipe(); + VPValue *Op = getVecOp(); + uint64_t MulConst; + // If the partial reduction is predicated, a select will be operand 1. + // If it isn't predicated and the mul isn't operating on a constant, then it + // should have been turned into a VPExpressionRecipe. + // FIXME: Replace the entire function with this once all partial reduction + // variants are bundled into VPExpressionRecipe. + if (!match(Op, m_Select(m_VPValue(), m_VPValue(Op), m_VPValue())) && + !match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) { + auto *PhiType = Ctx.Types.inferScalarType(getChainOp()); + auto *InputType = Ctx.Types.inferScalarType(getVecOp()); + return Ctx.TTI.getPartialReductionCost(getOpcode(), InputType, InputType, + PhiType, VF, TTI::PR_None, + TTI::PR_None, {}, Ctx.CostKind); } + VPRecipeBase *OpR = Op->getDefiningRecipe(); Type *InputTypeA = nullptr, *InputTypeB = nullptr; TTI::PartialReductionExtendKind ExtAType = TTI::PR_None, ExtBType = TTI::PR_None; @@ -2856,11 +2866,19 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind()); switch (ExpressionType) { case ExpressionTypes::ExtendedReduction: { - return Ctx.TTI.getExtendedReductionCost( - Opcode, - cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == - Instruction::ZExt, - RedTy, SrcVecTy, std::nullopt, Ctx.CostKind); + unsigned Opcode = RecurrenceDescriptor::getOpcode( + cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind()); + auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); + return isa<VPPartialReductionRecipe>(ExpressionRecipes.back()) + ? Ctx.TTI.getPartialReductionCost( + Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, + RedTy, VF, + TargetTransformInfo::getPartialReductionExtendKind( + ExtR->getOpcode()), + TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind) + : Ctx.TTI.getExtendedReductionCost( + Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy, + SrcVecTy, std::nullopt, Ctx.CostKind); } case ExpressionTypes::MulAccReduction: return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy, @@ -2871,6 +2889,19 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, Opcode = Instruction::Sub; [[fallthrough]]; case ExpressionTypes::ExtMulAccReduction: { + if (isa<VPPartialReductionRecipe>(ExpressionRecipes.back())) { + auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); + auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]); + auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]); + return Ctx.TTI.getPartialReductionCost( + Opcode, Ctx.Types.inferScalarType(getOperand(0)), + Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF, + TargetTransformInfo::getPartialReductionExtendKind( + Ext0R->getOpcode()), + TargetTransformInfo::getPartialReductionExtendKind( + Ext1R->getOpcode()), + Mul->getOpcode(), Ctx.CostKind); + } return Ctx.TTI.getMulAccReductionCost( cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == Instruction::ZExt, @@ -2910,12 +2941,13 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, O << " = "; auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back()); unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); + bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red); switch (ExpressionType) { case ExpressionTypes::ExtendedReduction: { getOperand(1)->printAsOperand(O, SlotTracker); - O << " +"; - O << " reduce." << Instruction::getOpcodeName(Opcode) << " ("; + O << " + " << (IsPartialReduction ? "partial." : "") << "reduce."; + O << Instruction::getOpcodeName(Opcode) << " ("; getOperand(0)->printAsOperand(O, SlotTracker); Red->printFlags(O); @@ -2931,8 +2963,8 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, } case ExpressionTypes::ExtNegatedMulAccReduction: { getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); - O << " + reduce." - << Instruction::getOpcodeName( + O << " + " << (IsPartialReduction ? "partial." : "") << "reduce."; + O << Instruction::getOpcodeName( RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) << " (sub (0, mul"; auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]); @@ -2956,9 +2988,8 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, case ExpressionTypes::MulAccReduction: case ExpressionTypes::ExtMulAccReduction: { getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); - O << " + "; - O << "reduce." - << Instruction::getOpcodeName( + O << " + " << (IsPartialReduction ? "partial." : "") << "reduce."; + O << Instruction::getOpcodeName( RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) << " ("; O << "mul"; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f5a3af4..3e85e6f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -3519,18 +3519,31 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VPValue *VecOp = Red->getVecOp(); // Clamp the range if using extended-reduction is profitable. - auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt, - Type *SrcTy) -> bool { + auto IsExtendedRedValidAndClampRange = + [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool { return LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF)); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost( - Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(), - CostKind); + + InstructionCost ExtRedCost; InstructionCost ExtCost = cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx); InstructionCost RedCost = Red->computeCost(VF, Ctx); + + if (isa<VPPartialReductionRecipe>(Red)) { + TargetTransformInfo::PartialReductionExtendKind ExtKind = + TargetTransformInfo::getPartialReductionExtendKind(ExtOpc); + // FIXME: Move partial reduction creation, costing and clamping + // here from LoopVectorize.cpp. + ExtRedCost = Ctx.TTI.getPartialReductionCost( + Opcode, SrcTy, nullptr, RedTy, VF, ExtKind, + llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind); + } else { + ExtRedCost = Ctx.TTI.getExtendedReductionCost( + Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy, + Red->getFastMathFlags(), CostKind); + } return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost; }, Range); @@ -3541,8 +3554,7 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) && IsExtendedRedValidAndClampRange( RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()), - cast<VPWidenCastRecipe>(VecOp)->getOpcode() == - Instruction::CastOps::ZExt, + cast<VPWidenCastRecipe>(VecOp)->getOpcode(), Ctx.Types.inferScalarType(A))) return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red); @@ -3560,6 +3572,8 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range) { + bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red); + unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); if (Opcode != Instruction::Add && Opcode != Instruction::Sub) return nullptr; @@ -3568,16 +3582,41 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, // Clamp the range if using multiply-accumulate-reduction is profitable. auto IsMulAccValidAndClampRange = - [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, - VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool { + [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1, + VPWidenCastRecipe *OuterExt) -> bool { return LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; Type *SrcTy = Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy; - auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF)); - InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost( - isZExt, Opcode, RedTy, SrcVecTy, CostKind); + InstructionCost MulAccCost; + + if (IsPartialReduction) { + Type *SrcTy2 = + Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr; + // FIXME: Move partial reduction creation, costing and clamping + // here from LoopVectorize.cpp. + MulAccCost = Ctx.TTI.getPartialReductionCost( + Opcode, SrcTy, SrcTy2, RedTy, VF, + Ext0 ? TargetTransformInfo::getPartialReductionExtendKind( + Ext0->getOpcode()) + : TargetTransformInfo::PR_None, + Ext1 ? TargetTransformInfo::getPartialReductionExtendKind( + Ext1->getOpcode()) + : TargetTransformInfo::PR_None, + Mul->getOpcode(), CostKind); + } else { + // Only partial reductions support mixed extends at the moment. + if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode()) + return false; + + bool IsZExt = + !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt; + auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF)); + MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy, + SrcVecTy, CostKind); + } + InstructionCost MulCost = Mul->computeCost(VF, Ctx); InstructionCost RedCost = Red->computeCost(VF, Ctx); InstructionCost ExtCost = 0; @@ -3611,14 +3650,10 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe()); auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe()); - // Match reduce.add(mul(ext, ext)). - if (RecipeA && RecipeB && - (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) && - match(RecipeA, m_ZExtOrSExt(m_VPValue())) && + // Match reduce.add/sub(mul(ext, ext)). + if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) && match(RecipeB, m_ZExtOrSExt(m_VPValue())) && - IsMulAccValidAndClampRange(RecipeA->getOpcode() == - Instruction::CastOps::ZExt, - Mul, RecipeA, RecipeB, nullptr)) { + IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) { if (Sub) return new VPExpressionRecipe(RecipeA, RecipeB, Mul, cast<VPWidenRecipe>(Sub), Red); @@ -3626,8 +3661,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, } // Match reduce.add(mul). // TODO: Add an expression type for this variant with a negated mul - if (!Sub && - IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr)) + if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr)) return new VPExpressionRecipe(Mul, Red); } // TODO: Add an expression type for negated versions of other expression @@ -3647,9 +3681,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe()); if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) && Ext0->getOpcode() == Ext1->getOpcode() && - IsMulAccValidAndClampRange(Ext0->getOpcode() == - Instruction::CastOps::ZExt, - Mul, Ext0, Ext1, Ext)) { + IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) { auto *NewExt0 = new VPWidenCastRecipe( Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0, *Ext0, Ext0->getDebugLoc()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 32e4b88..06c3d75 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -151,6 +151,8 @@ unsigned vputils::getVFScaleFactor(VPRecipeBase *R) { return RR->getVFScaleFactor(); if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R)) return RR->getVFScaleFactor(); + if (auto *ER = dyn_cast<VPExpressionRecipe>(R)) + return ER->getVFScaleFactor(); assert( (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() != VPInstruction::ReductionStartVector) && |