aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp31
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp25
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp23
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineInternal.h3
-rw-r--r--llvm/lib/Transforms/InstCombine/InstructionCombining.cpp6
-rw-r--r--llvm/lib/Transforms/Scalar/ConstraintElimination.cpp20
-rw-r--r--llvm/lib/Transforms/Scalar/MergeICmps.cpp34
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h13
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp67
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp80
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.cpp2
11 files changed, 206 insertions, 98 deletions
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index a0f7ec6..2dd0fde 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -948,17 +948,17 @@ void llvm::updateVCallVisibilityInIndex(
// linker, as we have no information on their eventual use.
if (DynamicExportSymbols.count(P.first))
continue;
+ // With validation enabled, we want to exclude symbols visible to regular
+ // objects. Local symbols will be in this group due to the current
+ // implementation but those with VCallVisibilityTranslationUnit will have
+ // already been marked in clang so are unaffected.
+ if (VisibleToRegularObjSymbols.count(P.first))
+ continue;
for (auto &S : P.second.getSummaryList()) {
auto *GVar = dyn_cast<GlobalVarSummary>(S.get());
if (!GVar ||
GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic)
continue;
- // With validation enabled, we want to exclude symbols visible to regular
- // objects. Local symbols will be in this group due to the current
- // implementation but those with VCallVisibilityTranslationUnit will have
- // already been marked in clang so are unaffected.
- if (VisibleToRegularObjSymbols.count(P.first))
- continue;
GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit);
}
}
@@ -1161,14 +1161,10 @@ bool DevirtIndex::tryFindVirtualCallTargets(
// and therefore the same GUID. This can happen if there isn't enough
// distinguishing path when compiling the source file. In that case we
// conservatively return false early.
+ if (P.VTableVI.hasLocal() && P.VTableVI.getSummaryList().size() > 1)
+ return false;
const GlobalVarSummary *VS = nullptr;
- bool LocalFound = false;
for (const auto &S : P.VTableVI.getSummaryList()) {
- if (GlobalValue::isLocalLinkage(S->linkage())) {
- if (LocalFound)
- return false;
- LocalFound = true;
- }
auto *CurVS = cast<GlobalVarSummary>(S->getBaseObject());
if (!CurVS->vTableFuncs().empty() ||
// Previously clang did not attach the necessary type metadata to
@@ -1184,6 +1180,7 @@ bool DevirtIndex::tryFindVirtualCallTargets(
// with public LTO visibility.
if (VS->getVCallVisibility() == GlobalObject::VCallVisibilityPublic)
return false;
+ break;
}
}
// There will be no VS if all copies are available_externally having no
@@ -1411,9 +1408,8 @@ bool DevirtIndex::trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot,
// If the summary list contains multiple summaries where at least one is
// a local, give up, as we won't know which (possibly promoted) name to use.
- for (const auto &S : TheFn.getSummaryList())
- if (GlobalValue::isLocalLinkage(S->linkage()) && Size > 1)
- return false;
+ if (TheFn.hasLocal() && Size > 1)
+ return false;
// Collect functions devirtualized at least for one call site for stats.
if (PrintSummaryDevirt || AreStatisticsEnabled())
@@ -2591,6 +2587,11 @@ void DevirtIndex::run() {
if (ExportSummary.typeIdCompatibleVtableMap().empty())
return;
+ // Assert that we haven't made any changes that would affect the hasLocal()
+ // flag on the GUID summary info.
+ assert(!ExportSummary.withInternalizeAndPromote() &&
+ "Expect index-based WPD to run before internalization and promotion");
+
DenseMap<GlobalValue::GUID, std::vector<StringRef>> NameByGUID;
for (const auto &P : ExportSummary.typeIdCompatibleVtableMap()) {
NameByGUID[GlobalValue::getGUIDAssumingExternalLinkage(P.first)].push_back(
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 73ec451..9bee523 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2760,21 +2760,34 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
// Optimize pointer differences into the same array into a size. Consider:
// &A[10] - &A[0]: we should compile this to "10".
Value *LHSOp, *RHSOp;
- if (match(Op0, m_PtrToInt(m_Value(LHSOp))) &&
- match(Op1, m_PtrToInt(m_Value(RHSOp))))
+ if (match(Op0, m_PtrToIntOrAddr(m_Value(LHSOp))) &&
+ match(Op1, m_PtrToIntOrAddr(m_Value(RHSOp))))
if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(),
I.hasNoUnsignedWrap()))
return replaceInstUsesWith(I, Res);
// trunc(p)-trunc(q) -> trunc(p-q)
- if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) &&
- match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp)))))
+ if (match(Op0, m_Trunc(m_PtrToIntOrAddr(m_Value(LHSOp)))) &&
+ match(Op1, m_Trunc(m_PtrToIntOrAddr(m_Value(RHSOp)))))
if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(),
/* IsNUW */ false))
return replaceInstUsesWith(I, Res);
- if (match(Op0, m_ZExt(m_PtrToIntSameSize(DL, m_Value(LHSOp)))) &&
- match(Op1, m_ZExtOrSelf(m_PtrToInt(m_Value(RHSOp))))) {
+ auto MatchSubOfZExtOfPtrToIntOrAddr = [&]() {
+ if (match(Op0, m_ZExt(m_PtrToIntSameSize(DL, m_Value(LHSOp)))) &&
+ match(Op1, m_ZExt(m_PtrToIntSameSize(DL, m_Value(RHSOp)))))
+ return true;
+ if (match(Op0, m_ZExt(m_PtrToAddr(m_Value(LHSOp)))) &&
+ match(Op1, m_ZExt(m_PtrToAddr(m_Value(RHSOp)))))
+ return true;
+ // Special case for non-canonical ptrtoint in constant expression,
+ // where the zext has been folded into the ptrtoint.
+ if (match(Op0, m_ZExt(m_PtrToIntSameSize(DL, m_Value(LHSOp)))) &&
+ match(Op1, m_PtrToInt(m_Value(RHSOp))))
+ return true;
+ return false;
+ };
+ if (MatchSubOfZExtOfPtrToIntOrAddr()) {
if (auto *GEP = dyn_cast<GEPOperator>(LHSOp)) {
if (GEP->getPointerOperand() == RHSOp) {
if (GEP->hasNoUnsignedWrap() || GEP->hasNoUnsignedSignedWrap()) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index dab200d..669d4f0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -4003,18 +4003,29 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
// Try to fold intrinsic into select/phi operands. This is legal if:
// * The intrinsic is speculatable.
- // * The select condition is not a vector, or the intrinsic does not
- // perform cross-lane operations.
- if (isSafeToSpeculativelyExecuteWithVariableReplaced(&CI) &&
- isNotCrossLaneOperation(II))
+ // * The operand is one of the following:
+ // - a phi.
+ // - a select with a scalar condition.
+ // - a select with a vector condition and II is not a cross lane operation.
+ if (isSafeToSpeculativelyExecuteWithVariableReplaced(&CI)) {
for (Value *Op : II->args()) {
- if (auto *Sel = dyn_cast<SelectInst>(Op))
- if (Instruction *R = FoldOpIntoSelect(*II, Sel))
+ if (auto *Sel = dyn_cast<SelectInst>(Op)) {
+ bool IsVectorCond = Sel->getCondition()->getType()->isVectorTy();
+ if (IsVectorCond && !isNotCrossLaneOperation(II))
+ continue;
+ // Don't replace a scalar select with a more expensive vector select if
+ // we can't simplify both arms of the select.
+ bool SimplifyBothArms =
+ !Op->getType()->isVectorTy() && II->getType()->isVectorTy();
+ if (Instruction *R = FoldOpIntoSelect(
+ *II, Sel, /*FoldWithMultiUse=*/false, SimplifyBothArms))
return R;
+ }
if (auto *Phi = dyn_cast<PHINode>(Op))
if (Instruction *R = foldOpIntoPhi(*II, Phi))
return R;
}
+ }
if (Instruction *Shuf = foldShuffledIntrinsicOperands(II))
return Shuf;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 943c223..ede73f8 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -664,7 +664,8 @@ public:
/// This also works for Cast instructions, which obviously do not have a
/// second operand.
Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
- bool FoldWithMultiUse = false);
+ bool FoldWithMultiUse = false,
+ bool SimplifyBothArms = false);
/// This is a convenience wrapper function for the above two functions.
Instruction *foldBinOpIntoSelectOrPhi(BinaryOperator &I);
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 3f11cae..67e2aae 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1777,7 +1777,8 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, SelectInst *SI,
}
Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
- bool FoldWithMultiUse) {
+ bool FoldWithMultiUse,
+ bool SimplifyBothArms) {
// Don't modify shared select instructions unless set FoldWithMultiUse
if (!SI->hasOneUse() && !FoldWithMultiUse)
return nullptr;
@@ -1821,6 +1822,9 @@ Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
if (!NewTV && !NewFV)
return nullptr;
+ if (SimplifyBothArms && !(NewTV && NewFV))
+ return nullptr;
+
// Create an instruction for the arm that did not fold.
if (!NewTV)
NewTV = foldOperationIntoSelectOperand(Op, SI, TV, *this);
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 4acc3f2..d347ced 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -614,6 +614,16 @@ static Decomposition decompose(Value *V,
return {V, IsKnownNonNegative};
}
+ if (match(V, m_Add(m_Value(Op0), m_ConstantInt(CI))) && CI->isNegative() &&
+ canUseSExt(CI)) {
+ Preconditions.emplace_back(
+ CmpInst::ICMP_UGE, Op0,
+ ConstantInt::get(Op0->getType(), CI->getSExtValue() * -1));
+ if (auto Decomp = MergeResults(Op0, CI, true))
+ return *Decomp;
+ return {V, IsKnownNonNegative};
+ }
+
if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1)))) {
if (!isKnownNonNegative(Op0, DL))
Preconditions.emplace_back(CmpInst::ICMP_SGE, Op0,
@@ -627,16 +637,6 @@ static Decomposition decompose(Value *V,
return {V, IsKnownNonNegative};
}
- if (match(V, m_Add(m_Value(Op0), m_ConstantInt(CI))) && CI->isNegative() &&
- canUseSExt(CI)) {
- Preconditions.emplace_back(
- CmpInst::ICMP_UGE, Op0,
- ConstantInt::get(Op0->getType(), CI->getSExtValue() * -1));
- if (auto Decomp = MergeResults(Op0, CI, true))
- return *Decomp;
- return {V, IsKnownNonNegative};
- }
-
// Decompose or as an add if there are no common bits between the operands.
if (match(V, m_DisjointOr(m_Value(Op0), m_ConstantInt(CI)))) {
if (auto Decomp = MergeResults(Op0, CI, IsSigned))
diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp
index a83cbd17a7..f273e9d 100644
--- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp
@@ -64,10 +64,10 @@
using namespace llvm;
-namespace {
-
#define DEBUG_TYPE "mergeicmps"
+namespace {
+
// A BCE atom "Binary Compare Expression Atom" represents an integer load
// that is a constant offset from a base value, e.g. `a` or `o.c` in the example
// at the top.
@@ -128,11 +128,12 @@ private:
unsigned Order = 1;
DenseMap<const Value*, int> BaseToIndex;
};
+} // namespace
// If this value is a load from a constant offset w.r.t. a base address, and
// there are no other users of the load or address, returns the base address and
// the offset.
-BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) {
+static BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) {
auto *const LoadI = dyn_cast<LoadInst>(Val);
if (!LoadI)
return {};
@@ -175,6 +176,7 @@ BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) {
return BCEAtom(GEP, LoadI, BaseId.getBaseId(Base), Offset);
}
+namespace {
// A comparison between two BCE atoms, e.g. `a == o.a` in the example at the
// top.
// Note: the terminology is misleading: the comparison is symmetric, so there
@@ -239,6 +241,7 @@ class BCECmpBlock {
private:
BCECmp Cmp;
};
+} // namespace
bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
AliasAnalysis &AA) const {
@@ -302,9 +305,9 @@ bool BCECmpBlock::doesOtherWork() const {
// Visit the given comparison. If this is a comparison between two valid
// BCE atoms, returns the comparison.
-std::optional<BCECmp> visitICmp(const ICmpInst *const CmpI,
- const ICmpInst::Predicate ExpectedPredicate,
- BaseIdentifier &BaseId) {
+static std::optional<BCECmp>
+visitICmp(const ICmpInst *const CmpI,
+ const ICmpInst::Predicate ExpectedPredicate, BaseIdentifier &BaseId) {
// The comparison can only be used once:
// - For intermediate blocks, as a branch condition.
// - For the final block, as an incoming value for the Phi.
@@ -332,10 +335,9 @@ std::optional<BCECmp> visitICmp(const ICmpInst *const CmpI,
// Visit the given comparison block. If this is a comparison between two valid
// BCE atoms, returns the comparison.
-std::optional<BCECmpBlock> visitCmpBlock(Value *const Val,
- BasicBlock *const Block,
- const BasicBlock *const PhiBlock,
- BaseIdentifier &BaseId) {
+static std::optional<BCECmpBlock>
+visitCmpBlock(Value *const Val, BasicBlock *const Block,
+ const BasicBlock *const PhiBlock, BaseIdentifier &BaseId) {
if (Block->empty())
return std::nullopt;
auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
@@ -397,6 +399,7 @@ static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons,
Comparisons.push_back(std::move(Comparison));
}
+namespace {
// A chain of comparisons.
class BCECmpChain {
public:
@@ -420,6 +423,7 @@ private:
// The original entry block (before sorting);
BasicBlock *EntryBlock_;
};
+} // namespace
static bool areContiguous(const BCECmpBlock &First, const BCECmpBlock &Second) {
return First.Lhs().BaseId == Second.Lhs().BaseId &&
@@ -742,9 +746,8 @@ bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
return true;
}
-std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
- BasicBlock *const LastBlock,
- int NumBlocks) {
+static std::vector<BasicBlock *>
+getOrderedBlocks(PHINode &Phi, BasicBlock *const LastBlock, int NumBlocks) {
// Walk up from the last block to find other blocks.
std::vector<BasicBlock *> Blocks(NumBlocks);
assert(LastBlock && "invalid last block");
@@ -777,8 +780,8 @@ std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
return Blocks;
}
-bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, AliasAnalysis &AA,
- DomTreeUpdater &DTU) {
+static bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI,
+ AliasAnalysis &AA, DomTreeUpdater &DTU) {
LLVM_DEBUG(dbgs() << "processPhi()\n");
if (Phi.getNumIncomingValues() <= 1) {
LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n");
@@ -874,6 +877,7 @@ static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
return MadeChange;
}
+namespace {
class MergeICmpsLegacyPass : public FunctionPass {
public:
static char ID;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 06bea2f..a1ad2db 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2712,7 +2712,8 @@ public:
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
- R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
+ R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
+ R->getVPDefID() == VPRecipeBase::VPPartialReductionSC;
}
static inline bool classof(const VPUser *U) {
@@ -2783,7 +2784,10 @@ public:
Opcode(Opcode), VFScaleFactor(ScaleFactor) {
[[maybe_unused]] auto *AccumulatorRecipe =
getChainOp()->getDefiningRecipe();
- assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
+ // When cloning as part of a VPExpressionRecipe the chain op could have been
+ // replaced by a temporary VPValue, so it doesn't have a defining recipe.
+ assert((!AccumulatorRecipe ||
+ isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
isa<VPPartialReductionRecipe>(AccumulatorRecipe)) &&
"Unexpected operand order for partial reduction recipe");
}
@@ -3093,6 +3097,11 @@ public:
/// removed before codegen.
void decompose();
+ unsigned getVFScaleFactor() const {
+ auto *PR = dyn_cast<VPPartialReductionRecipe>(ExpressionRecipes.back());
+ return PR ? PR->getVFScaleFactor() : 1;
+ }
+
/// Method for generating code, must not be called as this recipe is abstract.
void execute(VPTransformState &State) override {
llvm_unreachable("recipe must be removed before execute");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1f1b42b..931a5b7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -168,6 +168,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
case VPBlendSC:
case VPReductionEVLSC:
+ case VPPartialReductionSC:
case VPReductionSC:
case VPScalarIVStepsSC:
case VPVectorPointerSC:
@@ -300,14 +301,23 @@ InstructionCost
VPPartialReductionRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
std::optional<unsigned> Opcode;
- VPValue *Op = getOperand(0);
- VPRecipeBase *OpR = Op->getDefiningRecipe();
-
- // If the partial reduction is predicated, a select will be operand 0
- if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
- OpR = Op->getDefiningRecipe();
+ VPValue *Op = getVecOp();
+ uint64_t MulConst;
+ // If the partial reduction is predicated, a select will be operand 1.
+ // If it isn't predicated and the mul isn't operating on a constant, then it
+ // should have been turned into a VPExpressionRecipe.
+ // FIXME: Replace the entire function with this once all partial reduction
+ // variants are bundled into VPExpressionRecipe.
+ if (!match(Op, m_Select(m_VPValue(), m_VPValue(Op), m_VPValue())) &&
+ !match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) {
+ auto *PhiType = Ctx.Types.inferScalarType(getChainOp());
+ auto *InputType = Ctx.Types.inferScalarType(getVecOp());
+ return Ctx.TTI.getPartialReductionCost(getOpcode(), InputType, InputType,
+ PhiType, VF, TTI::PR_None,
+ TTI::PR_None, {}, Ctx.CostKind);
}
+ VPRecipeBase *OpR = Op->getDefiningRecipe();
Type *InputTypeA = nullptr, *InputTypeB = nullptr;
TTI::PartialReductionExtendKind ExtAType = TTI::PR_None,
ExtBType = TTI::PR_None;
@@ -2856,11 +2866,19 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
switch (ExpressionType) {
case ExpressionTypes::ExtendedReduction: {
- return Ctx.TTI.getExtendedReductionCost(
- Opcode,
- cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
- Instruction::ZExt,
- RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
+ unsigned Opcode = RecurrenceDescriptor::getOpcode(
+ cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
+ auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
+ return isa<VPPartialReductionRecipe>(ExpressionRecipes.back())
+ ? Ctx.TTI.getPartialReductionCost(
+ Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr,
+ RedTy, VF,
+ TargetTransformInfo::getPartialReductionExtendKind(
+ ExtR->getOpcode()),
+ TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind)
+ : Ctx.TTI.getExtendedReductionCost(
+ Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy,
+ SrcVecTy, std::nullopt, Ctx.CostKind);
}
case ExpressionTypes::MulAccReduction:
return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
@@ -2871,6 +2889,19 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
Opcode = Instruction::Sub;
[[fallthrough]];
case ExpressionTypes::ExtMulAccReduction: {
+ if (isa<VPPartialReductionRecipe>(ExpressionRecipes.back())) {
+ auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
+ auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
+ auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
+ return Ctx.TTI.getPartialReductionCost(
+ Opcode, Ctx.Types.inferScalarType(getOperand(0)),
+ Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
+ TargetTransformInfo::getPartialReductionExtendKind(
+ Ext0R->getOpcode()),
+ TargetTransformInfo::getPartialReductionExtendKind(
+ Ext1R->getOpcode()),
+ Mul->getOpcode(), Ctx.CostKind);
+ }
return Ctx.TTI.getMulAccReductionCost(
cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
Instruction::ZExt,
@@ -2910,12 +2941,13 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
O << " = ";
auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
+ bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
switch (ExpressionType) {
case ExpressionTypes::ExtendedReduction: {
getOperand(1)->printAsOperand(O, SlotTracker);
- O << " +";
- O << " reduce." << Instruction::getOpcodeName(Opcode) << " (";
+ O << " + " << (IsPartialReduction ? "partial." : "") << "reduce.";
+ O << Instruction::getOpcodeName(Opcode) << " (";
getOperand(0)->printAsOperand(O, SlotTracker);
Red->printFlags(O);
@@ -2931,8 +2963,8 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
}
case ExpressionTypes::ExtNegatedMulAccReduction: {
getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
- O << " + reduce."
- << Instruction::getOpcodeName(
+ O << " + " << (IsPartialReduction ? "partial." : "") << "reduce.";
+ O << Instruction::getOpcodeName(
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
<< " (sub (0, mul";
auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
@@ -2956,9 +2988,8 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
case ExpressionTypes::MulAccReduction:
case ExpressionTypes::ExtMulAccReduction: {
getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
- O << " + ";
- O << "reduce."
- << Instruction::getOpcodeName(
+ O << " + " << (IsPartialReduction ? "partial." : "") << "reduce.";
+ O << Instruction::getOpcodeName(
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
<< " (";
O << "mul";
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f5a3af4..3e85e6f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3519,18 +3519,31 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
VPValue *VecOp = Red->getVecOp();
// Clamp the range if using extended-reduction is profitable.
- auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt,
- Type *SrcTy) -> bool {
+ auto IsExtendedRedValidAndClampRange =
+ [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
return LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) {
auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost(
- Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(),
- CostKind);
+
+ InstructionCost ExtRedCost;
InstructionCost ExtCost =
cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
InstructionCost RedCost = Red->computeCost(VF, Ctx);
+
+ if (isa<VPPartialReductionRecipe>(Red)) {
+ TargetTransformInfo::PartialReductionExtendKind ExtKind =
+ TargetTransformInfo::getPartialReductionExtendKind(ExtOpc);
+ // FIXME: Move partial reduction creation, costing and clamping
+ // here from LoopVectorize.cpp.
+ ExtRedCost = Ctx.TTI.getPartialReductionCost(
+ Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
+ llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
+ } else {
+ ExtRedCost = Ctx.TTI.getExtendedReductionCost(
+ Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
+ Red->getFastMathFlags(), CostKind);
+ }
return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
},
Range);
@@ -3541,8 +3554,7 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
IsExtendedRedValidAndClampRange(
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
- cast<VPWidenCastRecipe>(VecOp)->getOpcode() ==
- Instruction::CastOps::ZExt,
+ cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
Ctx.Types.inferScalarType(A)))
return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
@@ -3560,6 +3572,8 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
static VPExpressionRecipe *
tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
VPCostContext &Ctx, VFRange &Range) {
+ bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
+
unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
return nullptr;
@@ -3568,16 +3582,41 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
// Clamp the range if using multiply-accumulate-reduction is profitable.
auto IsMulAccValidAndClampRange =
- [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
- VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
+ [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
+ VPWidenCastRecipe *OuterExt) -> bool {
return LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Type *SrcTy =
Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
- auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
- InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost(
- isZExt, Opcode, RedTy, SrcVecTy, CostKind);
+ InstructionCost MulAccCost;
+
+ if (IsPartialReduction) {
+ Type *SrcTy2 =
+ Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
+ // FIXME: Move partial reduction creation, costing and clamping
+ // here from LoopVectorize.cpp.
+ MulAccCost = Ctx.TTI.getPartialReductionCost(
+ Opcode, SrcTy, SrcTy2, RedTy, VF,
+ Ext0 ? TargetTransformInfo::getPartialReductionExtendKind(
+ Ext0->getOpcode())
+ : TargetTransformInfo::PR_None,
+ Ext1 ? TargetTransformInfo::getPartialReductionExtendKind(
+ Ext1->getOpcode())
+ : TargetTransformInfo::PR_None,
+ Mul->getOpcode(), CostKind);
+ } else {
+ // Only partial reductions support mixed extends at the moment.
+ if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
+ return false;
+
+ bool IsZExt =
+ !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
+ auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
+ MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
+ SrcVecTy, CostKind);
+ }
+
InstructionCost MulCost = Mul->computeCost(VF, Ctx);
InstructionCost RedCost = Red->computeCost(VF, Ctx);
InstructionCost ExtCost = 0;
@@ -3611,14 +3650,10 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
- // Match reduce.add(mul(ext, ext)).
- if (RecipeA && RecipeB &&
- (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&
- match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
+ // Match reduce.add/sub(mul(ext, ext)).
+ if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
- IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
- Instruction::CastOps::ZExt,
- Mul, RecipeA, RecipeB, nullptr)) {
+ IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
if (Sub)
return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
cast<VPWidenRecipe>(Sub), Red);
@@ -3626,8 +3661,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
}
// Match reduce.add(mul).
// TODO: Add an expression type for this variant with a negated mul
- if (!Sub &&
- IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
+ if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
return new VPExpressionRecipe(Mul, Red);
}
// TODO: Add an expression type for negated versions of other expression
@@ -3647,9 +3681,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
Ext0->getOpcode() == Ext1->getOpcode() &&
- IsMulAccValidAndClampRange(Ext0->getOpcode() ==
- Instruction::CastOps::ZExt,
- Mul, Ext0, Ext1, Ext)) {
+ IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
auto *NewExt0 = new VPWidenCastRecipe(
Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
*Ext0, Ext0->getDebugLoc());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 32e4b88..06c3d75 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -151,6 +151,8 @@ unsigned vputils::getVFScaleFactor(VPRecipeBase *R) {
return RR->getVFScaleFactor();
if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
return RR->getVFScaleFactor();
+ if (auto *ER = dyn_cast<VPExpressionRecipe>(R))
+ return ER->getVFScaleFactor();
assert(
(!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
VPInstruction::ReductionStartVector) &&