Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--  llvm/lib/Transforms/IPO/ExpandVariadics.cpp                 2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp        2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp       19
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineInternal.h       2
-rw-r--r--  llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp   8
-rw-r--r--  llvm/lib/Transforms/Instrumentation/MemProfUse.cpp          1
-rw-r--r--  llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp            38
-rw-r--r--  llvm/lib/Transforms/Utils/PredicateInfo.cpp                 1
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp                  89
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp            26
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp          65
11 files changed, 195 insertions, 58 deletions
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index 042578d..6a11aec 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -380,7 +380,7 @@ bool ExpandVariadics::runOnModule(Module &M) {
if (CB->isIndirectCall()) {
FunctionType *FTy = CB->getFunctionType();
if (FTy->isVarArg())
- Changed |= expandCall(M, Builder, CB, FTy, 0);
+ Changed |= expandCall(M, Builder, CB, FTy, /*NF=*/nullptr);
}
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8d9933b..92fca90 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3496,7 +3496,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
if (isPowerOf2_64(AlignMask + 1)) {
uint64_t Offset = 0;
match(A, m_Add(m_Value(A), m_ConstantInt(Offset)));
- if (match(A, m_PtrToInt(m_Value(A)))) {
+ if (match(A, m_PtrToIntOrAddr(m_Value(A)))) {
/// Note: this doesn't preserve the offset information but merges
/// offset and alignment.
/// TODO: we can generate a GEP instead of merging the alignment with
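In IR terms, the widened matcher lets the existing alignment-assume fold fire when the address is obtained with ptrtoaddr as well as ptrtoint. A hand-written sketch of the matched pattern (value names and constants invented here):

    %addr = ptrtoaddr ptr %p to i64
    %sum  = add i64 %addr, 32
    %bits = and i64 %sum, 63
    %ok   = icmp eq i64 %bits, 0
    call void @llvm.assume(i1 %ok)

As the note above says, the offset (32 here) is merged into the alignment rather than tracked separately.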
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index f939e7a..614c6eb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2148,7 +2148,7 @@ Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) {
return nullptr;
}
-Value *InstCombinerImpl::foldPtrToIntOfGEP(Type *IntTy, Value *Ptr) {
+Value *InstCombinerImpl::foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr) {
// Look through chain of one-use GEPs.
Type *PtrTy = Ptr->getType();
SmallVector<GEPOperator *> GEPs;
@@ -2210,7 +2210,7 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
Mask->getType() == Ty)
return BinaryOperator::CreateAnd(Builder.CreatePtrToInt(Ptr, Ty), Mask);
- if (Value *V = foldPtrToIntOfGEP(Ty, SrcOp))
+ if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp))
return replaceInstUsesWith(CI, V);
Value *Vec, *Scalar, *Index;
@@ -2228,6 +2228,21 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
}
Instruction *InstCombinerImpl::visitPtrToAddr(PtrToAddrInst &CI) {
+ Value *SrcOp = CI.getPointerOperand();
+ Type *Ty = CI.getType();
+
+ // (ptrtoaddr (ptrmask P, M))
+ // -> (and (ptrtoaddr P), M)
+ // This is generally beneficial as `and` is better supported than `ptrmask`.
+ Value *Ptr, *Mask;
+ if (match(SrcOp, m_OneUse(m_Intrinsic<Intrinsic::ptrmask>(m_Value(Ptr),
+ m_Value(Mask)))) &&
+ Mask->getType() == Ty)
+ return BinaryOperator::CreateAnd(Builder.CreatePtrToAddr(Ptr), Mask);
+
+ if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp))
+ return replaceInstUsesWith(CI, V);
+
// FIXME: Implement variants of ptrtoint folds.
return commonCastTransforms(CI);
}
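In IR terms, the new visitPtrToAddr fold performs roughly this rewrite (a sketch with invented names; i64 stands in for the pointer's address width):

    %masked = call ptr @llvm.ptrmask.p0.i64(ptr %p, i64 %mask)
    %addr   = ptrtoaddr ptr %masked to i64
    ; becomes
    %paddr  = ptrtoaddr ptr %p to i64
    %addr   = and i64 %paddr, %mask

The m_OneUse guard ensures the ptrmask itself becomes dead, so the rewrite trades the intrinsic for a plain `and`.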
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 9c75d9a..d85e4f7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -700,7 +700,7 @@ public:
/// folded operation.
void PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN);
- Value *foldPtrToIntOfGEP(Type *IntTy, Value *Ptr);
+ Value *foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr);
Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, CmpPredicate Cond,
Instruction &I);
Instruction *foldSelectICmp(CmpPredicate Pred, SelectInst *SI, Value *RHS,
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 5ba2167..cc53ec2 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -1957,8 +1957,12 @@ Value *DataFlowSanitizer::getShadowAddress(Value *Addr,
Value *DataFlowSanitizer::getShadowAddress(Value *Addr,
BasicBlock::iterator Pos) {
IRBuilder<> IRB(Pos->getParent(), Pos);
- Value *ShadowOffset = getShadowOffset(Addr, IRB);
- return getShadowAddress(Addr, Pos, ShadowOffset);
+ Value *ShadowAddr = getShadowOffset(Addr, IRB);
+ uint64_t ShadowBase = MapParams->ShadowBase;
+ if (ShadowBase != 0)
+ ShadowAddr =
+ IRB.CreateAdd(ShadowAddr, ConstantInt::get(IntptrTy, ShadowBase));
+ return getShadowAddress(Addr, Pos, ShadowAddr);
}
Value *DFSanFunction::combineShadowsThenConvert(Type *T, Value *V1, Value *V2,
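With this change, shadow mappings that define a non-zero ShadowBase get the base added on top of the computed offset. Roughly, the emitted address computation becomes (a sketch with invented names; the exact getShadowOffset arithmetic is scheme-dependent):

    %off    = and i64 %addr.int, %and.mask     ; getShadowOffset
    %shadow = add i64 %off, 81604378624        ; new: + ShadowBase, only when non-zero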
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index a6ec6c1..2f256df 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -216,7 +216,6 @@ static void HandleUnsupportedAnnotationKinds(GlobalVariable &GVar,
}
LLVM_DEBUG(dbgs() << "Skip annotation for " << GVar.getName() << " due to "
<< Reason << ".\n");
- return;
}
struct AllocMatchInfo {
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 66e45ec..e84ca81 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -122,16 +122,22 @@ static cl::opt<unsigned>
cl::desc("Maximum cost accepted for the transformation"),
cl::Hidden, cl::init(50));
-extern cl::opt<bool> ProfcheckDisableMetadataFixes;
-
-} // namespace llvm
-
static cl::opt<double> MaxClonedRate(
"dfa-max-cloned-rate",
cl::desc(
"Maximum cloned instructions rate accepted for the transformation"),
cl::Hidden, cl::init(7.5));
+static cl::opt<unsigned>
+ MaxOuterUseBlocks("dfa-max-out-use-blocks",
+ cl::desc("Maximum unduplicated blocks with outer uses "
+ "accepted for the transformation"),
+ cl::Hidden, cl::init(40));
+
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
namespace {
class SelectInstToUnfold {
SelectInst *SI;
@@ -965,8 +971,16 @@ private:
// SLPVectorizer.
// TODO: Thread the switch partially before reaching the threshold.
uint64_t NumOrigInst = 0;
- for (auto *BB : DuplicateMap.keys())
+ uint64_t NumOuterUseBlock = 0;
+ for (auto *BB : DuplicateMap.keys()) {
NumOrigInst += BB->sizeWithoutDebug();
+ // Only unduplicated blocks with a single predecessor require new phi
+ // nodes.
+ for (auto *Succ : successors(BB))
+ if (!DuplicateMap.count(Succ) && Succ->getSinglePredecessor())
+ NumOuterUseBlock++;
+ }
+
if (double(NumClonedInst) / double(NumOrigInst) > MaxClonedRate) {
LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too many "
"instructions will be cloned\n");
@@ -977,6 +991,20 @@ private:
return false;
}
+ // Too many unduplicated blocks with outer uses may cause too many
+ // phi node insertions for duplicated definitions. TODO: Drop this
+ // threshold if we come up with another way to reduce the number of
+ // inserted phi nodes.
+ if (NumOuterUseBlock > MaxOuterUseBlocks) {
+ LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too many "
+ "blocks with outer uses\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable", Switch)
+ << "Too much blocks with outer uses.";
+ });
+ return false;
+ }
+
InstructionCost DuplicationCost = 0;
unsigned JumpTableSize = 0;
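Like the pass's other cutoffs, the new threshold is a hidden cl::opt, so it can be overridden from the command line when investigating the heuristic; a hypothetical invocation:

    opt -passes=dfa-jump-threading -dfa-max-out-use-blocks=20 -S input.ll -o output.ll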
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index a9ab3b3..27fed73 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -809,7 +809,6 @@ public:
void emitInstructionAnnot(const Instruction *I,
formatted_raw_ostream &OS) override {
if (const auto *PI = PredInfo->getPredicateInfoFor(I)) {
- OS << "; Has predicate info\n";
if (const auto *PB = dyn_cast<PredicateBranch>(PI)) {
OS << "; branch predicate info { TrueEdge: " << PB->TrueEdge
<< " Comparison:" << *PB->Condition << " Edge: [";
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index c537be5c..b03fb62 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1866,10 +1866,19 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI,
// If either of the blocks has its address taken, then we can't do this fold,
// because the code we'd hoist would no longer run when we jump into the block
// by its address.
- for (auto *Succ : successors(BB))
- if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor())
+ for (auto *Succ : successors(BB)) {
+ if (Succ->hasAddressTaken())
return false;
-
+ if (Succ->getSinglePredecessor())
+ continue;
+ // If Succ has more than one predecessor, check whether Succ contains only
+ // a single `unreachable` inst. Since executing an `unreachable` inst is UB,
+ // we can relax the condition based on the assumption that the program
+ // never enters Succ and triggers that UB.
+ if (isa<UnreachableInst>(*Succ->begin()))
+ continue;
+ return false;
+ }
// The second of pair is a SkipFlags bitmask.
using SuccIterPair = std::pair<BasicBlock::iterator, unsigned>;
SmallVector<SuccIterPair, 8> SuccIterPairs;
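A minimal CFG that the relaxed check now accepts (hand-written sketch): the shared block begins with unreachable, so although it has several predecessors, the program can be assumed never to enter it, and hoisting from bb's successors remains sound.

    bb:
      br i1 %c, label %succ, label %trap
    succ:                                ; single predecessor: fine as before
      ...
    trap:                                ; multiple predecessors, but unreachable
      unreachable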
@@ -5228,32 +5237,52 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
CompVal, DL.getIntPtrType(CompVal->getType()), "magicptr");
}
- // Create the new switch instruction now.
- SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
- if (HasProfile) {
- // We know the weight of the default case. We don't know the weight of the
- // other cases, but rather than completely lose profiling info, we split
- // the remaining probability equally over them.
- SmallVector<uint32_t> NewWeights(Values.size() + 1);
- NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped if
- // TrueWhenEqual.
- for (auto &V : drop_begin(NewWeights))
- V = BranchWeights[0] / Values.size();
- setBranchWeights(*New, NewWeights, /*IsExpected=*/false);
- }
-
- // Add all of the 'cases' to the switch instruction.
- for (ConstantInt *Val : Values)
- New->addCase(Val, EdgeBB);
+ // Check if we can represent the values as a contiguous range. If so, we use a
+ // range check + conditional branch instead of a switch.
+ if (Values.front()->getValue() - Values.back()->getValue() ==
+ Values.size() - 1) {
+ ConstantRange RangeToCheck = ConstantRange::getNonEmpty(
+ Values.back()->getValue(), Values.front()->getValue() + 1);
+ APInt Offset, RHS;
+ ICmpInst::Predicate Pred;
+ RangeToCheck.getEquivalentICmp(Pred, RHS, Offset);
+ Value *X = CompVal;
+ if (!Offset.isZero())
+ X = Builder.CreateAdd(X, ConstantInt::get(CompVal->getType(), Offset));
+ Value *Cond =
+ Builder.CreateICmp(Pred, X, ConstantInt::get(CompVal->getType(), RHS));
+ BranchInst *NewBI = Builder.CreateCondBr(Cond, EdgeBB, DefaultBB);
+ if (HasProfile)
+ setBranchWeights(*NewBI, BranchWeights, /*IsExpected=*/false);
+ // We don't need to update PHI nodes since we don't add any new edges.
+ } else {
+ // Create the new switch instruction now.
+ SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
+ if (HasProfile) {
+ // We know the weight of the default case. We don't know the weight of the
+ // other cases, but rather than completely lose profiling info, we split
+ // the remaining probability equally over them.
+ SmallVector<uint32_t> NewWeights(Values.size() + 1);
+ NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped
+ // if TrueWhenEqual.
+ for (auto &V : drop_begin(NewWeights))
+ V = BranchWeights[0] / Values.size();
+ setBranchWeights(*New, NewWeights, /*IsExpected=*/false);
+ }
- // We added edges from PI to the EdgeBB. As such, if there were any
- // PHI nodes in EdgeBB, they need entries to be added corresponding to
- // the number of edges added.
- for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) {
- PHINode *PN = cast<PHINode>(BBI);
- Value *InVal = PN->getIncomingValueForBlock(BB);
- for (unsigned i = 0, e = Values.size() - 1; i != e; ++i)
- PN->addIncoming(InVal, BB);
+ // Add all of the 'cases' to the switch instruction.
+ for (ConstantInt *Val : Values)
+ New->addCase(Val, EdgeBB);
+
+ // We added edges from PI to the EdgeBB. As such, if there were any
+ // PHI nodes in EdgeBB, they need entries to be added corresponding to
+ // the number of edges added.
+ for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) {
+ PHINode *PN = cast<PHINode>(BBI);
+ Value *InVal = PN->getIncomingValueForBlock(BB);
+ for (unsigned i = 0, e = Values.size() - 1; i != e; ++i)
+ PN->addIncoming(InVal, BB);
+ }
}
// Erase the old branch instruction.
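For a contiguous set of case values the new path emits a subtract-and-compare instead of a switch. A sketch for cases {3, 4, 5} (invented names; getEquivalentICmp supplies the offset and bound):

    %off  = add i8 %x, -3                ; Offset = -3
    %cond = icmp ult i8 %off, 3          ; RHS = 3
    br i1 %cond, label %edge, label %default

Since the conditional branch reuses the single existing edge into EdgeBB, no PHI entries need updating, unlike the switch path.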
@@ -7603,7 +7632,9 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
auto *DefaultCaseBB = SI->getDefaultDest();
BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU);
auto It = OrigBB->getTerminator()->getIterator();
- BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It);
+ auto *BI = BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It);
+ // BI is handling the default case for SI, and so should share its DebugLoc.
+ BI->setDebugLoc(SI->getDebugLoc());
It->eraseFromParent();
addPredecessorToBlock(DefaultCaseBB, OrigBB, SplitBB);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4fcaf6d..1b55a3b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5608,6 +5608,7 @@ private:
for (ScheduleBundle *Bundle : Bundles) {
if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
break;
+ SmallPtrSet<Value *, 4> ParentsUniqueUsers;
// Need to search for the lane since the tree entry can be
// reordered.
auto *It = find(Bundle->getTreeEntry()->Scalars, In);
@@ -5636,6 +5637,22 @@ private:
Bundle->getTreeEntry()->isCopyableElement(In)) &&
"Missed TreeEntry operands?");
+ bool IsNonSchedulableWithParentPhiNode =
+ Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
+ Bundle->getTreeEntry()->UserTreeIndex &&
+ Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
+ Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
+ Instruction::PHI;
+ // Count the number of unique phi nodes in the parent entry, and exit
+ // once all the unique phis have been processed.
+ if (IsNonSchedulableWithParentPhiNode) {
+ const TreeEntry *ParentTE =
+ Bundle->getTreeEntry()->UserTreeIndex.UserTE;
+ Value *User = ParentTE->Scalars[Lane];
+ if (!ParentsUniqueUsers.insert(User).second)
+ break;
+ }
+
for (unsigned OpIdx :
seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
if (auto *I = dyn_cast<Instruction>(
@@ -5644,8 +5661,8 @@ private:
<< *I << "\n");
DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
}
- // If parent node is schedulable, it will be handle correctly.
- if (!Bundle->getTreeEntry()->doesNotNeedToSchedule())
+ // If parent node is schedulable, it will be handled correctly.
+ if (!IsNonSchedulableWithParentPhiNode)
break;
It = std::find(std::next(It),
Bundle->getTreeEntry()->Scalars.end(), In);
@@ -16903,7 +16920,10 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
// otherwise TEPtr depends on TE.
if ((TEInsertBlock != InsertPt->getParent() ||
TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
- !CheckOrdering(InsertPt))
+ (!CheckOrdering(InsertPt) ||
+ (UseEI.UserTE->hasCopyableElements() &&
+ isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
+ is_contained(UseEI.UserTE->Scalars, TEInsertPt))))
continue;
// The node is reused - exit.
if (CheckAndUseSameNode(TEPtr))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index acad795..4d98014 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3648,6 +3648,37 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
Sub = VecOp->getDefiningRecipe();
VecOp = Tmp;
}
+
+ // If ValB is a constant and can be safely extended, truncate it to the same
+ // type as ExtA's operand, then extend it to the same type as ExtA. This
+ // creates two uniform extends that can more easily be matched by the rest of
+ // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
+ // replaced with the new extend of the constant.
+ auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
+ VPWidenCastRecipe *&ExtB,
+ VPValue *&ValB, VPWidenRecipe *Mul) {
+ if (!ExtA || ExtB || !ValB->isLiveIn())
+ return;
+ Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
+ Instruction::CastOps ExtOpc = ExtA->getOpcode();
+ const APInt *Const;
+ if (!match(ValB, m_APInt(Const)) ||
+ !llvm::canConstantBeExtended(
+ Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
+ return;
+ // The truncate ensures that the type of each extended operand is the
+ // same, and it's been proven that the constant can be extended from
+ // NarrowTy safely. Necessary since ExtA's extended operand would be
+ // e.g. an i8, while the const will likely be an i32. This will be
+ // elided by later optimisations.
+ VPBuilder Builder(Mul);
+ auto *Trunc =
+ Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
+ Type *WideTy = Ctx.Types.inferScalarType(ExtA);
+ ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
+ Mul->setOperand(1, ExtB);
+ };
+
// Try to match reduce.add(mul(...)).
if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
auto *RecipeA =
@@ -3656,6 +3687,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
+ // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
+ ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
+
// Match reduce.add/sub(mul(ext, ext)).
if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
@@ -3665,7 +3699,6 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
cast<VPWidenRecipe>(Sub), Red);
return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
}
- // Match reduce.add(mul).
// TODO: Add an expression type for this variant with a negated mul
if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
return new VPExpressionRecipe(Mul, Red);
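In scalar terms, ExtendAndReplaceConstantOp rewrites a mul between an extend and a live-in constant into a mul of two matching extends, provided the constant survives the round trip through the narrow type. A rough IR analogy of the recipe-level rewrite (invented names):

    %ext = sext <16 x i8> %a to <16 x i32>
    %mul = mul <16 x i32> %ext, splat (i32 42)
    ; becomes, since 42 is losslessly representable in i8:
    %cst = sext <16 x i8> splat (i8 42) to <16 x i32>
    %mul = mul <16 x i32> %ext, %cst

The two uniform extends can then be paired up by the partial-reduction bundling below.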
@@ -3674,18 +3707,26 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
// variants.
if (Sub)
return nullptr;
- // Match reduce.add(ext(mul(ext(A), ext(B)))).
- // All extend recipes must have same opcode or A == B
- // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))).
- if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
- m_ZExtOrSExt(m_VPValue()))))) {
+
+ // Match reduce.add(ext(mul(A, B))).
+ if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
- auto *Ext0 =
- cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
- auto *Ext1 =
- cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
- if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
+ auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
+ auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
+
+ // reduce.add(ext(mul(ext, const)))
+ // -> reduce.add(ext(mul(ext, ext(const))))
+ ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
+
+ // reduce.add(ext(mul(ext(A), ext(B))))
+ // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
+ // The inner extends must either have the same opcode as the outer extend,
+ // or be identical to each other, in which case the multiply can never
+ // result in a negative value and the outer extend can be folded away by
+ // doing wider extends for the operands of the mul.
+ if (Ext0 && Ext1 &&
+ (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
Ext0->getOpcode() == Ext1->getOpcode() &&
IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
auto *NewExt0 = new VPWidenCastRecipe(
@@ -4021,7 +4062,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
DenseMap<const SCEV *, Value *>
VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
const DataLayout &DL = SE.getDataLayout();
- SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/true);
+ SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false);
auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
BasicBlock *EntryBB = Entry->getIRBasicBlock();