Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--  llvm/lib/Transforms/IPO/ExpandVariadics.cpp                  2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp         2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp        19
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineInternal.h        2
-rw-r--r--  llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp    8
-rw-r--r--  llvm/lib/Transforms/Instrumentation/MemProfUse.cpp           1
-rw-r--r--  llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp             38
-rw-r--r--  llvm/lib/Transforms/Utils/PredicateInfo.cpp                  1
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp                   89
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp             26
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp           65
11 files changed, 195 insertions, 58 deletions
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index 042578d..6a11aec 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -380,7 +380,7 @@ bool ExpandVariadics::runOnModule(Module &M) {
       if (CB->isIndirectCall()) {
         FunctionType *FTy = CB->getFunctionType();
         if (FTy->isVarArg())
-          Changed |= expandCall(M, Builder, CB, FTy, 0);
+          Changed |= expandCall(M, Builder, CB, FTy, /*NF=*/nullptr);
       }
     }
   }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8d9933b..92fca90 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3496,7 +3496,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     if (isPowerOf2_64(AlignMask + 1)) {
       uint64_t Offset = 0;
       match(A, m_Add(m_Value(A), m_ConstantInt(Offset)));
-      if (match(A, m_PtrToInt(m_Value(A)))) {
+      if (match(A, m_PtrToIntOrAddr(m_Value(A)))) {
         /// Note: this doesn't preserve the offset information but merges
         /// offset and alignment.
         /// TODO: we can generate a GEP instead of merging the alignment with
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index f939e7a..614c6eb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2148,7 +2148,7 @@ Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) {
   return nullptr;
 }
 
-Value *InstCombinerImpl::foldPtrToIntOfGEP(Type *IntTy, Value *Ptr) {
+Value *InstCombinerImpl::foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr) {
   // Look through chain of one-use GEPs.
   Type *PtrTy = Ptr->getType();
   SmallVector<GEPOperator *> GEPs;
@@ -2210,7 +2210,7 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
       Mask->getType() == Ty)
     return BinaryOperator::CreateAnd(Builder.CreatePtrToInt(Ptr, Ty), Mask);
 
-  if (Value *V = foldPtrToIntOfGEP(Ty, SrcOp))
+  if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp))
     return replaceInstUsesWith(CI, V);
 
   Value *Vec, *Scalar, *Index;
@@ -2228,6 +2228,21 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
 }
 
 Instruction *InstCombinerImpl::visitPtrToAddr(PtrToAddrInst &CI) {
+  Value *SrcOp = CI.getPointerOperand();
+  Type *Ty = CI.getType();
+
+  // (ptrtoaddr (ptrmask P, M))
+  //    -> (and (ptrtoaddr P), M)
+  // This is generally beneficial as `and` is better supported than `ptrmask`.
+  Value *Ptr, *Mask;
+  if (match(SrcOp, m_OneUse(m_Intrinsic<Intrinsic::ptrmask>(m_Value(Ptr),
+                                                            m_Value(Mask)))) &&
+      Mask->getType() == Ty)
+    return BinaryOperator::CreateAnd(Builder.CreatePtrToAddr(Ptr), Mask);
+
+  if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp))
+    return replaceInstUsesWith(CI, V);
+
   // FIXME: Implement variants of ptrtoint folds.
   return commonCastTransforms(CI);
 }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 9c75d9a..d85e4f7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -700,7 +700,7 @@ public:
   /// folded operation.
   void PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN);
-  Value *foldPtrToIntOfGEP(Type *IntTy, Value *Ptr);
+  Value *foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr);
   Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, CmpPredicate Cond,
                            Instruction &I);
   Instruction *foldSelectICmp(CmpPredicate Pred, SelectInst *SI, Value *RHS,
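The new visitPtrToAddr combines mirror the existing ptrtoint folds. A minimal IR sketch of the ptrmask fold, assuming a target whose pointer address width is 64 bits (function and value names are illustrative):

    ; Before: mask the pointer, then take its address.
    define i64 @src(ptr %p) {
      %masked = call ptr @llvm.ptrmask.p0.i64(ptr %p, i64 -16)
      %addr = ptrtoaddr ptr %masked to i64
      ret i64 %addr
    }

    ; After: take the address, then mask with a plain `and`, which targets
    ; support better than `ptrmask`.
    define i64 @tgt(ptr %p) {
      %addr = ptrtoaddr ptr %p to i64
      %masked = and i64 %addr, -16
      ret i64 %masked
    }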
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 5ba2167..cc53ec2 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -1957,8 +1957,12 @@ Value *DataFlowSanitizer::getShadowAddress(Value *Addr,
 Value *DataFlowSanitizer::getShadowAddress(Value *Addr,
                                            BasicBlock::iterator Pos) {
   IRBuilder<> IRB(Pos->getParent(), Pos);
-  Value *ShadowOffset = getShadowOffset(Addr, IRB);
-  return getShadowAddress(Addr, Pos, ShadowOffset);
+  Value *ShadowAddr = getShadowOffset(Addr, IRB);
+  uint64_t ShadowBase = MapParams->ShadowBase;
+  if (ShadowBase != 0)
+    ShadowAddr =
+        IRB.CreateAdd(ShadowAddr, ConstantInt::get(IntptrTy, ShadowBase));
+  return getShadowAddress(Addr, Pos, ShadowAddr);
 }
 
 Value *DFSanFunction::combineShadowsThenConvert(Type *T, Value *V1, Value *V2,
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index a6ec6c1..2f256df 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -216,7 +216,6 @@ static void HandleUnsupportedAnnotationKinds(GlobalVariable &GVar,
   }
   LLVM_DEBUG(dbgs() << "Skip annotation for " << GVar.getName() << " due to "
                     << Reason << ".\n");
-  return;
 }
 
 struct AllocMatchInfo {
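On platforms whose shadow mapping uses a non-zero ShadowBase, the shadow address is now formed by adding the base after the offset computation. Roughly the IR this emits for a shadow lookup; the constants are placeholders and the exact offset arithmetic depends on the platform's MemoryMapParams:

    %a      = ptrtoint ptr %addr to i64
    %off    = xor i64 %a, 123456          ; getShadowOffset (mask is illustrative)
    %shadow = add i64 %off, 268435456     ; the new ShadowBase addition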
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 66e45ec..e84ca81 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -122,16 +122,22 @@
         cl::desc("Maximum cost accepted for the transformation"),
         cl::Hidden, cl::init(50));
 
-extern cl::opt<bool> ProfcheckDisableMetadataFixes;
-
-} // namespace llvm
-
 static cl::opt<double> MaxClonedRate(
     "dfa-max-cloned-rate",
     cl::desc(
         "Maximum cloned instructions rate accepted for the transformation"),
     cl::Hidden, cl::init(7.5));
 
+static cl::opt<unsigned>
+    MaxOuterUseBlocks("dfa-max-out-use-blocks",
+                      cl::desc("Maximum unduplicated blocks with outer uses "
+                               "accepted for the transformation"),
+                      cl::Hidden, cl::init(40));
+
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
 namespace {
 class SelectInstToUnfold {
   SelectInst *SI;
@@ -965,8 +971,16 @@ private:
     // SLPVectorizer.
     // TODO: Thread the switch partially before reaching the threshold.
     uint64_t NumOrigInst = 0;
-    for (auto *BB : DuplicateMap.keys())
+    uint64_t NumOuterUseBlock = 0;
+    for (auto *BB : DuplicateMap.keys()) {
       NumOrigInst += BB->sizeWithoutDebug();
+      // Only unduplicated blocks with a single predecessor require new phi
+      // nodes.
+      for (auto *Succ : successors(BB))
+        if (!DuplicateMap.count(Succ) && Succ->getSinglePredecessor())
+          NumOuterUseBlock++;
+    }
+
     if (double(NumClonedInst) / double(NumOrigInst) > MaxClonedRate) {
       LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too many "
                            "instructions will be cloned\n");
@@ -977,6 +991,20 @@ private:
       return false;
     }
 
+    // Too many unduplicated blocks with outer uses may cause too many
+    // insertions of phi nodes for duplicated definitions. TODO: Drop this
+    // threshold if we come up with another way to reduce the number of
+    // inserted phi nodes.
+    if (NumOuterUseBlock > MaxOuterUseBlocks) {
+      LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too many "
+                           "blocks with outer uses\n");
+      ORE->emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable", Switch)
+               << "Too many blocks with outer uses.";
+      });
+      return false;
+    }
+
     InstructionCost DuplicationCost = 0;
 
     unsigned JumpTableSize = 0;
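For context, the pass threads switches of the shape below, where the state phi makes each iteration's successor statically known; threading duplicates the paths between state assignments, and the two new caps (dfa-max-cloned-rate and dfa-max-out-use-blocks) reject cases where that duplication, or the phi insertion it forces, would balloon. A schematic of the targeted pattern (block and value names invented):

    loop:                                    ; state machine over a phi of constants
      %state = phi i32 [ 0, %entry ], [ %state.next, %latch ]
      switch i32 %state, label %default [
        i32 0, label %case0
        i32 1, label %case1
      ]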
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index a9ab3b3..27fed73 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -809,7 +809,6 @@ public:
   void emitInstructionAnnot(const Instruction *I,
                             formatted_raw_ostream &OS) override {
     if (const auto *PI = PredInfo->getPredicateInfoFor(I)) {
-      OS << "; Has predicate info\n";
       if (const auto *PB = dyn_cast<PredicateBranch>(PI)) {
         OS << "; branch predicate info { TrueEdge: " << PB->TrueEdge
            << " Comparison:" << *PB->Condition << " Edge: [";
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index c537be5c..b03fb62 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1866,10 +1866,19 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI,
   // If either of the blocks has its address taken, then we can't do this fold,
   // because the code we'd hoist would no longer run when we jump into the block
   // by its address.
-  for (auto *Succ : successors(BB))
-    if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor())
+  for (auto *Succ : successors(BB)) {
+    if (Succ->hasAddressTaken())
       return false;
-
+    if (Succ->getSinglePredecessor())
+      continue;
+    // If Succ has more than one predecessor, continue to check whether Succ
+    // contains only a single `unreachable` inst. Since executing an
+    // `unreachable` inst is UB, we can relax the condition based on the
+    // assumption that the program would never enter Succ and trigger that UB.
+    if (isa<UnreachableInst>(*Succ->begin()))
+      continue;
+    return false;
+  }
   // The second of pair is a SkipFlags bitmask.
   using SuccIterPair = std::pair<BasicBlock::iterator, unsigned>;
   SmallVector<SuccIterPair, 8> SuccIterPairs;
@@ -5228,32 +5237,52 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
         CompVal, DL.getIntPtrType(CompVal->getType()), "magicptr");
   }
 
-  // Create the new switch instruction now.
-  SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
-  if (HasProfile) {
-    // We know the weight of the default case. We don't know the weight of the
-    // other cases, but rather than completely lose profiling info, we split
-    // the remaining probability equally over them.
-    SmallVector<uint32_t> NewWeights(Values.size() + 1);
-    NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped if
-                                      // TrueWhenEqual.
-    for (auto &V : drop_begin(NewWeights))
-      V = BranchWeights[0] / Values.size();
-    setBranchWeights(*New, NewWeights, /*IsExpected=*/false);
-  }
-
-  // Add all of the 'cases' to the switch instruction.
-  for (ConstantInt *Val : Values)
-    New->addCase(Val, EdgeBB);
+  // Check if we can represent the values as a contiguous range. If so, we
+  // use a range check + conditional branch instead of a switch.
+  if (Values.front()->getValue() - Values.back()->getValue() ==
+      Values.size() - 1) {
+    ConstantRange RangeToCheck = ConstantRange::getNonEmpty(
+        Values.back()->getValue(), Values.front()->getValue() + 1);
+    APInt Offset, RHS;
+    ICmpInst::Predicate Pred;
+    RangeToCheck.getEquivalentICmp(Pred, RHS, Offset);
+    Value *X = CompVal;
+    if (!Offset.isZero())
+      X = Builder.CreateAdd(X, ConstantInt::get(CompVal->getType(), Offset));
+    Value *Cond =
+        Builder.CreateICmp(Pred, X, ConstantInt::get(CompVal->getType(), RHS));
+    BranchInst *NewBI = Builder.CreateCondBr(Cond, EdgeBB, DefaultBB);
+    if (HasProfile)
+      setBranchWeights(*NewBI, BranchWeights, /*IsExpected=*/false);
+    // We don't need to update PHI nodes since we don't add any new edges.
+  } else {
+    // Create the new switch instruction now.
+    SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
+    if (HasProfile) {
+      // We know the weight of the default case. We don't know the weight of
+      // the other cases, but rather than completely lose profiling info, we
+      // split the remaining probability equally over them.
+      SmallVector<uint32_t> NewWeights(Values.size() + 1);
+      NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped
+                                        // if TrueWhenEqual.
+      for (auto &V : drop_begin(NewWeights))
+        V = BranchWeights[0] / Values.size();
+      setBranchWeights(*New, NewWeights, /*IsExpected=*/false);
+    }
 
-  // We added edges from PI to the EdgeBB. As such, if there were any
-  // PHI nodes in EdgeBB, they need entries to be added corresponding to
-  // the number of edges added.
-  for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) {
-    PHINode *PN = cast<PHINode>(BBI);
-    Value *InVal = PN->getIncomingValueForBlock(BB);
-    for (unsigned i = 0, e = Values.size() - 1; i != e; ++i)
-      PN->addIncoming(InVal, BB);
+    // Add all of the 'cases' to the switch instruction.
+    for (ConstantInt *Val : Values)
+      New->addCase(Val, EdgeBB);
+
+    // We added edges from PI to the EdgeBB. As such, if there were any
+    // PHI nodes in EdgeBB, they need entries to be added corresponding to
+    // the number of edges added.
+    for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) {
+      PHINode *PN = cast<PHINode>(BBI);
+      Value *InVal = PN->getIncomingValueForBlock(BB);
+      for (unsigned i = 0, e = Values.size() - 1; i != e; ++i)
+        PN->addIncoming(InVal, BB);
+    }
   }
 
   // Erase the old branch instruction.
@@ -7603,7 +7632,9 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
   auto *DefaultCaseBB = SI->getDefaultDest();
   BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU);
   auto It = OrigBB->getTerminator()->getIterator();
-  BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It);
+  auto *BI = BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It);
+  // BI is handling the default case for SI, and so should share its DebugLoc.
+  BI->setDebugLoc(SI->getDebugLoc());
   It->eraseFromParent();
 
   addPredecessorToBlock(DefaultCaseBB, OrigBB, SplitBB);
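The contiguous-range special case in simplifyBranchOnICmpChain replaces the switch with a single subtract-and-compare. A sketch for the values {3, 4, 5} (value and label names illustrative):

    ; if (x == 3 || x == 4 || x == 5) previously lowered to a three-case
    ; switch; it now becomes an offset range check:
    %off  = add i32 %x, -3
    %cond = icmp ult i32 %off, 3
    br i1 %cond, label %edge, label %default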
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4fcaf6d..1b55a3b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5608,6 +5608,7 @@ private:
     for (ScheduleBundle *Bundle : Bundles) {
       if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
         break;
+      SmallPtrSet<Value *, 4> ParentsUniqueUsers;
       // Need to search for the lane since the tree entry can be
       // reordered.
       auto *It = find(Bundle->getTreeEntry()->Scalars, In);
@@ -5636,6 +5637,22 @@ private:
               Bundle->getTreeEntry()->isCopyableElement(In)) &&
              "Missed TreeEntry operands?");
 
+      bool IsNonSchedulableWithParentPhiNode =
+          Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
+          Bundle->getTreeEntry()->UserTreeIndex &&
+          Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
+          Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
+              Instruction::PHI;
+      // Count the number of unique phi nodes, which are the parents of the
+      // parent entry, and exit if all the unique phis have been processed.
+      if (IsNonSchedulableWithParentPhiNode) {
+        const TreeEntry *ParentTE =
+            Bundle->getTreeEntry()->UserTreeIndex.UserTE;
+        Value *User = ParentTE->Scalars[Lane];
+        if (!ParentsUniqueUsers.insert(User).second)
+          break;
+      }
+
       for (unsigned OpIdx :
            seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
         if (auto *I = dyn_cast<Instruction>(
@@ -5644,8 +5661,8 @@ private:
                             << *I << "\n");
           DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
         }
-      // If parent node is schedulable, it will be handle correctly.
-      if (!Bundle->getTreeEntry()->doesNotNeedToSchedule())
+      // If parent node is schedulable, it will be handled correctly.
+      if (!IsNonSchedulableWithParentPhiNode)
         break;
       It = std::find(std::next(It),
                      Bundle->getTreeEntry()->Scalars.end(), In);
@@ -16903,7 +16920,10 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       // otherwise TEPtr depends on TE.
       if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
-          !CheckOrdering(InsertPt))
+          (!CheckOrdering(InsertPt) ||
+           (UseEI.UserTE->hasCopyableElements() &&
+            isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
+            is_contained(UseEI.UserTE->Scalars, TEInsertPt))))
         continue;
       // The node is reused - exit.
       if (CheckAndUseSameNode(TEPtr))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index acad795..4d98014 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3648,6 +3648,37 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
     Sub = VecOp->getDefiningRecipe();
     VecOp = Tmp;
   }
+
+  // If ValB is a constant and can be safely extended, truncate it to the same
+  // type as ExtA's operand, then extend it to the same type as ExtA. This
+  // creates two uniform extends that can more easily be matched by the rest
+  // of the bundling code. The ExtB reference, ValB and operand 1 of Mul are
+  // all replaced with the new extend of the constant.
+  auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
+                                           VPWidenCastRecipe *&ExtB,
+                                           VPValue *&ValB, VPWidenRecipe *Mul) {
+    if (!ExtA || ExtB || !ValB->isLiveIn())
+      return;
+    Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
+    Instruction::CastOps ExtOpc = ExtA->getOpcode();
+    const APInt *Const;
+    if (!match(ValB, m_APInt(Const)) ||
+        !llvm::canConstantBeExtended(
+            Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
+      return;
+    // The truncate ensures that the type of each extended operand is the
+    // same, and it's been proven that the constant can be extended from
+    // NarrowTy safely. Necessary since ExtA's extended operand would be
+    // e.g. an i8, while the const will likely be an i32. This will be
+    // elided by later optimisations.
+    VPBuilder Builder(Mul);
+    auto *Trunc =
+        Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
+    Type *WideTy = Ctx.Types.inferScalarType(ExtA);
+    ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
+    Mul->setOperand(1, ExtB);
+  };
+
   // Try to match reduce.add(mul(...)).
   if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
     auto *RecipeA =
@@ -3656,6 +3687,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
         dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
     auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
 
+    // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const))).
+    ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
+
     // Match reduce.add/sub(mul(ext, ext)).
     if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
         match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
@@ -3665,7 +3699,6 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
                                        cast<VPWidenRecipe>(Sub), Red);
       return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
     }
-
     // Match reduce.add(mul).
     // TODO: Add an expression type for this variant with a negated mul
     if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
       return new VPExpressionRecipe(Mul, Red);
@@ -3674,18 +3707,26 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
   // variants.
   if (Sub)
     return nullptr;
-  // Match reduce.add(ext(mul(ext(A), ext(B)))).
-  // All extend recipes must have same opcode or A == B
-  // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))).
-  if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
-                                      m_ZExtOrSExt(m_VPValue()))))) {
+
+  // Match reduce.add(ext(mul(A, B))).
+  if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
     auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
     auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
-    auto *Ext0 =
-        cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
-    auto *Ext1 =
-        cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
-    if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
+    auto *Ext0 =
+        dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
+    auto *Ext1 =
+        dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
+
+    // reduce.add(ext(mul(ext, const)))
+    //    -> reduce.add(ext(mul(ext, ext(const))))
+    ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
+
+    // reduce.add(ext(mul(ext(A), ext(B))))
+    //    -> reduce.add(mul(wider_ext(A), wider_ext(B)))
+    // The inner extends must either have the same opcode as the outer extend
+    // or be the same, in which case the multiply can never result in a
+    // negative value and the outer extend can be folded away by doing wider
+    // extends for the operands of the mul.
+    if (Ext0 && Ext1 &&
+        (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
         Ext0->getOpcode() == Ext1->getOpcode() &&
         IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
       auto *NewExt0 = new VPWidenCastRecipe(
@@ -4021,7 +4062,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
 DenseMap<const SCEV *, Value *> VPlanTransforms::expandSCEVs(VPlan &Plan,
                                                              ScalarEvolution &SE) {
   const DataLayout &DL = SE.getDataLayout();
-  SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/true);
+  SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false);
   auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
   BasicBlock *EntryBB = Entry->getIRBasicBlock();
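The effect of ExtendAndReplaceConstantOp, written as plain vector IR rather than VPlan recipes (the i8/i32 types, vector width, and constant are illustrative): a live-in constant multiplier that provably fits the narrow type is rewritten as an extend of a truncated constant, so both multiply operands become matching extends and the mul-accumulate bundling can fire.

    ; reduce.add(mul(sext(A), 42)) ...
    %a.ext = sext <16 x i8> %a to <16 x i32>
    %mul   = mul <16 x i32> %a.ext, splat (i32 42)
    ; ... is treated as reduce.add(mul(sext(A), sext(trunc(42)))); 42 is
    ; representable in i8, so the truncate/extend round-trip is lossless.
    %c.ext = sext <16 x i8> splat (i8 42) to <16 x i32>
    %mul2  = mul <16 x i32> %a.ext, %c.ext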
