diff options
Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r-- | llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 119 | ||||
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp | 10 | ||||
-rw-r--r-- | llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp | 26 | ||||
-rw-r--r-- | llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 40 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 29 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.cpp | 9 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanHelpers.h | 16 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 126 |
8 files changed, 303 insertions, 72 deletions
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 8d9a0e7..50130da 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -2067,6 +2067,36 @@ static void inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes, AI.run(SCCNodes, Changed); } +// Determines if the function 'F' can be marked 'norecurse'. +// It returns true if any call within 'F' could lead to a recursive +// call back to 'F', and false otherwise. +// The 'AnyFunctionsAddressIsTaken' parameter is a module-wide flag +// that is true if any function's address is taken, or if any function +// has external linkage. This is used to determine the safety of +// external/library calls. +static bool mayHaveRecursiveCallee(Function &F, + bool AnyFunctionsAddressIsTaken = true) { + for (const auto &BB : F) { + for (const auto &I : BB.instructionsWithoutDebug()) { + if (const auto *CB = dyn_cast<CallBase>(&I)) { + const Function *Callee = CB->getCalledFunction(); + if (!Callee || Callee == &F) + return true; + + if (Callee->doesNotRecurse()) + continue; + + if (!AnyFunctionsAddressIsTaken || + (Callee->isDeclaration() && + Callee->hasFnAttribute(Attribute::NoCallback))) + continue; + return true; + } + } + } + return false; +} + static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes, SmallPtrSet<Function *, 8> &Changed) { // Try and identify functions that do not recurse. @@ -2078,28 +2108,14 @@ static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes, Function *F = *SCCNodes.begin(); if (!F || !F->hasExactDefinition() || F->doesNotRecurse()) return; - - // If all of the calls in F are identifiable and are to norecurse functions, F - // is norecurse. This check also detects self-recursion as F is not currently - // marked norecurse, so any called from F to F will not be marked norecurse. - for (auto &BB : *F) - for (auto &I : BB.instructionsWithoutDebug()) - if (auto *CB = dyn_cast<CallBase>(&I)) { - Function *Callee = CB->getCalledFunction(); - if (!Callee || Callee == F || - (!Callee->doesNotRecurse() && - !(Callee->isDeclaration() && - Callee->hasFnAttribute(Attribute::NoCallback)))) - // Function calls a potentially recursive function. - return; - } - - // Every call was to a non-recursive function other than this function, and - // we have no indirect recursion as the SCC size is one. This function cannot - // recurse. - F->setDoesNotRecurse(); - ++NumNoRecurse; - Changed.insert(F); + if (!mayHaveRecursiveCallee(*F)) { + // Every call was to a non-recursive function other than this function, and + // we have no indirect recursion as the SCC size is one. This function + // cannot recurse. + F->setDoesNotRecurse(); + ++NumNoRecurse; + Changed.insert(F); + } } // Set the noreturn function attribute if possible. @@ -2429,3 +2445,62 @@ ReversePostOrderFunctionAttrsPass::run(Module &M, ModuleAnalysisManager &AM) { PA.preserve<LazyCallGraphAnalysis>(); return PA; } + +PreservedAnalyses NoRecurseLTOInferencePass::run(Module &M, + ModuleAnalysisManager &MAM) { + + // Check if any function in the whole program has its address taken or has + // potentially external linkage. + // We use this information when inferring norecurse attribute: If there is + // no function whose address is taken and all functions have internal + // linkage, there is no path for a callback to any user function. + bool AnyFunctionsAddressIsTaken = false; + for (Function &F : M) { + if (F.isDeclaration() || F.doesNotRecurse()) + continue; + if (!F.hasLocalLinkage() || F.hasAddressTaken()) { + AnyFunctionsAddressIsTaken = true; + break; + } + } + + // Run norecurse inference on all RefSCCs in the LazyCallGraph for this + // module. + bool Changed = false; + LazyCallGraph &CG = MAM.getResult<LazyCallGraphAnalysis>(M); + CG.buildRefSCCs(); + + for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) { + // Skip any RefSCC that is part of a call cycle. A RefSCC containing more + // than one SCC indicates a recursive relationship involving indirect calls. + if (RC.size() > 1) + continue; + + // RefSCC contains a single-SCC. SCC size > 1 indicates mutually recursive + // functions. Ex: foo1 -> foo2 -> foo3 -> foo1. + LazyCallGraph::SCC &S = *RC.begin(); + if (S.size() > 1) + continue; + + // Get the single function from this SCC. + Function &F = S.begin()->getFunction(); + if (!F.hasExactDefinition() || F.doesNotRecurse()) + continue; + + // If the analysis confirms that this function has no recursive calls + // (either direct, indirect, or through external linkages), + // we can safely apply the norecurse attribute. + if (!mayHaveRecursiveCallee(F, AnyFunctionsAddressIsTaken)) { + F.setDoesNotRecurse(); + ++NumNoRecurse; + Changed = true; + } + } + + PreservedAnalyses PA; + if (Changed) + PA.preserve<LazyCallGraphAnalysis>(); + else + PA = PreservedAnalyses::all(); + return PA; +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 8f60e50..8c8fc69 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3356,7 +3356,10 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { impliesPoisonOrCond(FalseVal, B, /*Expected=*/false)) { // (A || B) || C --> A || (B | C) return replaceInstUsesWith( - SI, Builder.CreateLogicalOr(A, Builder.CreateOr(B, FalseVal))); + SI, Builder.CreateLogicalOr(A, Builder.CreateOr(B, FalseVal), "", + ProfcheckDisableMetadataFixes + ? nullptr + : cast<SelectInst>(CondVal))); } // (A && B) || (C && B) --> (A || C) && B @@ -3398,7 +3401,10 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { impliesPoisonOrCond(TrueVal, B, /*Expected=*/true)) { // (A && B) && C --> A && (B & C) return replaceInstUsesWith( - SI, Builder.CreateLogicalAnd(A, Builder.CreateAnd(B, TrueVal))); + SI, Builder.CreateLogicalAnd(A, Builder.CreateAnd(B, TrueVal), "", + ProfcheckDisableMetadataFixes + ? nullptr + : cast<SelectInst>(CondVal))); } // (A || B) && (C || B) --> (A && C) || B diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index e9a3e98..41a6c80 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -120,6 +120,12 @@ static cl::opt<unsigned> cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50)); +static cl::opt<double> MaxClonedRate( + "dfa-max-cloned-rate", + cl::desc( + "Maximum cloned instructions rate accepted for the transformation"), + cl::Hidden, cl::init(7.5)); + namespace { class SelectInstToUnfold { @@ -828,6 +834,7 @@ private: /// also returns false if it is illegal to clone some required block. bool isLegalAndProfitableToTransform() { CodeMetrics Metrics; + uint64_t NumClonedInst = 0; SwitchInst *Switch = SwitchPaths->getSwitchInst(); // Don't thread switch without multiple successors. @@ -837,7 +844,6 @@ private: // Note that DuplicateBlockMap is not being used as intended here. It is // just being used to ensure (BB, State) pairs are only counted once. DuplicateBlockMap DuplicateMap; - for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { PathType PathBBs = TPath.getPath(); APInt NextState = TPath.getExitValue(); @@ -848,6 +854,7 @@ private: BasicBlock *VisitedBB = getClonedBB(BB, NextState, DuplicateMap); if (!VisitedBB) { Metrics.analyzeBasicBlock(BB, *TTI, EphValues); + NumClonedInst += BB->sizeWithoutDebug(); DuplicateMap[BB].push_back({BB, NextState}); } @@ -865,6 +872,7 @@ private: if (VisitedBB) continue; Metrics.analyzeBasicBlock(BB, *TTI, EphValues); + NumClonedInst += BB->sizeWithoutDebug(); DuplicateMap[BB].push_back({BB, NextState}); } @@ -901,6 +909,22 @@ private: } } + // Too much cloned instructions slow down later optimizations, especially + // SLPVectorizer. + // TODO: Thread the switch partially before reaching the threshold. + uint64_t NumOrigInst = 0; + for (auto *BB : DuplicateMap.keys()) + NumOrigInst += BB->sizeWithoutDebug(); + if (double(NumClonedInst) / double(NumOrigInst) > MaxClonedRate) { + LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too much " + "instructions wll be cloned\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable", Switch) + << "Too much instructions will be cloned."; + }); + return false; + } + InstructionCost DuplicationCost = 0; unsigned JumpTableSize = 0; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 148bfa8..b8cfe3a 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -4895,9 +4895,8 @@ bool SimplifyCFGOpt::simplifyTerminatorOnSelect(Instruction *OldTerm, // We found both of the successors we were looking for. // Create a conditional branch sharing the condition of the select. BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB); - if (TrueWeight != FalseWeight) - setBranchWeights(*NewBI, {TrueWeight, FalseWeight}, - /*IsExpected=*/false, /*ElideAllZero=*/true); + setBranchWeights(*NewBI, {TrueWeight, FalseWeight}, + /*IsExpected=*/false, /*ElideAllZero=*/true); } } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) { // Neither of the selected blocks were successors, so this @@ -4982,9 +4981,15 @@ bool SimplifyCFGOpt::simplifyIndirectBrOnSelect(IndirectBrInst *IBI, BasicBlock *TrueBB = TBA->getBasicBlock(); BasicBlock *FalseBB = FBA->getBasicBlock(); + // The select's profile becomes the profile of the conditional branch that + // replaces the indirect branch. + SmallVector<uint32_t> SelectBranchWeights(2); + if (!ProfcheckDisableMetadataFixes) + extractBranchWeights(*SI, SelectBranchWeights); // Perform the actual simplification. - return simplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB, 0, - 0); + return simplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB, + SelectBranchWeights[0], + SelectBranchWeights[1]); } /// This is called when we find an icmp instruction @@ -7952,19 +7957,27 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) { BasicBlock *BB = IBI->getParent(); bool Changed = false; + SmallVector<uint32_t> BranchWeights; + const bool HasBranchWeights = !ProfcheckDisableMetadataFixes && + extractBranchWeights(*IBI, BranchWeights); + + DenseMap<const BasicBlock *, uint64_t> TargetWeight; + if (HasBranchWeights) + for (size_t I = 0, E = IBI->getNumDestinations(); I < E; ++I) + TargetWeight[IBI->getDestination(I)] += BranchWeights[I]; // Eliminate redundant destinations. SmallPtrSet<Value *, 8> Succs; SmallSetVector<BasicBlock *, 8> RemovedSuccs; - for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) { - BasicBlock *Dest = IBI->getDestination(i); + for (unsigned I = 0, E = IBI->getNumDestinations(); I != E; ++I) { + BasicBlock *Dest = IBI->getDestination(I); if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) { if (!Dest->hasAddressTaken()) RemovedSuccs.insert(Dest); Dest->removePredecessor(BB); - IBI->removeDestination(i); - --i; - --e; + IBI->removeDestination(I); + --I; + --E; Changed = true; } } @@ -7990,7 +8003,12 @@ bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) { eraseTerminatorAndDCECond(IBI); return true; } - + if (HasBranchWeights) { + SmallVector<uint64_t> NewBranchWeights(IBI->getNumDestinations()); + for (size_t I = 0, E = IBI->getNumDestinations(); I < E; ++I) + NewBranchWeights[I] += TargetWeight.find(IBI->getDestination(I))->second; + setFittedBranchWeights(*IBI, NewBranchWeights, /*IsExpected=*/false); + } if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) { if (simplifyIndirectBrOnSelect(IBI, SI)) return requestResimplify(); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e434e73..cee08ef 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3903,7 +3903,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( if (VF.isScalar()) continue; - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { @@ -4160,7 +4161,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Add on other costs that are modelled in VPlan, but not in the legacy // cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, + *CM.PSE.getSE()); VPRegionBlock *VectorRegion = P->getVectorLoopRegion(); assert(VectorRegion && "Expected to have a vector region!"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( @@ -6852,7 +6854,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF) const { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE()); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. @@ -7085,7 +7087,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // simplifications not accounted for in the legacy cost model. If that's the // case, don't trigger the assertion, as the extra simplifications may cause a // different VF to be picked by the VPlan-based cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, + *CM.PSE.getSE()); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); // Verify that the VPlan-based and legacy cost models agree, except for VPlans // with early exits and plans with additional VPlan simplifications. The @@ -8393,11 +8396,11 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( R->setOperand(1, WideIV->getStepValue()); } - VPlanTransforms::runPass( - VPlanTransforms::addExitUsersForFirstOrderRecurrences, *Plan, Range); + // TODO: We can't call runPass on these transforms yet, due to verifier + // failures. + VPlanTransforms::addExitUsersForFirstOrderRecurrences(*Plan, Range); DenseMap<VPValue *, VPValue *> IVEndValues; - VPlanTransforms::runPass(VPlanTransforms::addScalarResumePhis, *Plan, - RecipeBuilder, IVEndValues); + VPlanTransforms::addScalarResumePhis(*Plan, RecipeBuilder, IVEndValues); // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to @@ -8418,7 +8421,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -8508,8 +8512,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { DenseMap<VPValue *, VPValue *> IVEndValues; // TODO: IVEndValues are not used yet in the native path, to optimize exit // values. - VPlanTransforms::runPass(VPlanTransforms::addScalarResumePhis, *Plan, - RecipeBuilder, IVEndValues); + // TODO: We can't call runPass on the transform yet, due to verifier + // failures. + VPlanTransforms::addScalarResumePhis(*Plan, RecipeBuilder, IVEndValues); assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; @@ -9873,7 +9878,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, - CM.CostKind); + CM.CostKind, *CM.PSE.getSE()); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 07b191a..2555ebe 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1772,7 +1772,8 @@ VPCostContext::getOperandInfo(VPValue *V) const { } InstructionCost VPCostContext::getScalarizationOverhead( - Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) { + Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF, + bool AlwaysIncludeReplicatingR) { if (VF.isScalar()) return 0; @@ -1792,7 +1793,11 @@ InstructionCost VPCostContext::getScalarizationOverhead( SmallPtrSet<const VPValue *, 4> UniqueOperands; SmallVector<Type *> Tys; for (auto *Op : Operands) { - if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) || + if (Op->isLiveIn() || + (!AlwaysIncludeReplicatingR && + isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) || + (isa<VPReplicateRecipe>(Op) && + cast<VPReplicateRecipe>(Op)->getOpcode() == Instruction::Load) || !UniqueOperands.insert(Op).second) continue; Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF)); diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index fc1a09e..1580a3b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -349,12 +349,14 @@ struct VPCostContext { LoopVectorizationCostModel &CM; SmallPtrSet<Instruction *, 8> SkipCostComputation; TargetTransformInfo::TargetCostKind CostKind; + ScalarEvolution &SE; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const VPlan &Plan, LoopVectorizationCostModel &CM, - TargetTransformInfo::TargetCostKind CostKind) + TargetTransformInfo::TargetCostKind CostKind, + ScalarEvolution &SE) : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), - CostKind(CostKind) {} + CostKind(CostKind), SE(SE) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. @@ -374,10 +376,12 @@ struct VPCostContext { /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy /// and \p Operands with \p VF. This is a convenience wrapper for the - /// type-based getScalarizationOverhead API. - InstructionCost getScalarizationOverhead(Type *ResultTy, - ArrayRef<const VPValue *> Operands, - ElementCount VF); + /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR + /// is true, always compute the cost of scalarizing replicating operands. + InstructionCost + getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands, + ElementCount VF, + bool AlwaysIncludeReplicatingR = false); }; /// This class can be used to assign names to VPValues. For VPValues without diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 67b9244..94e2628 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -40,6 +40,7 @@ #include <cassert> using namespace llvm; +using namespace llvm::VPlanPatternMatch; using VectorParts = SmallVector<Value *, 2>; @@ -303,7 +304,6 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, VPRecipeBase *OpR = Op->getDefiningRecipe(); // If the partial reduction is predicated, a select will be operand 0 - using namespace llvm::VPlanPatternMatch; if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) { OpR = Op->getDefiningRecipe(); } @@ -1963,7 +1963,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); VPValue *Op0, *Op1; - using namespace llvm::VPlanPatternMatch; if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 && (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) || match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) { @@ -2778,7 +2777,7 @@ VPExpressionRecipe::VPExpressionRecipe( // Recipes in the expression, except the last one, must only be used by // (other) recipes inside the expression. If there are other users, external // to the expression, use a clone of the recipe for external users. - for (VPSingleDefRecipe *R : ExpressionRecipes) { + for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) { if (R != ExpressionRecipes.back() && any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) { return !ExpressionRecipesAsSetOfUsers.contains(U); @@ -3111,6 +3110,62 @@ bool VPReplicateRecipe::shouldPack() const { }); } +/// Returns true if \p Ptr is a pointer computation for which the legacy cost +/// model computes a SCEV expression when computing the address cost. +static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) { + auto *PtrR = Ptr->getDefiningRecipe(); + if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) && + cast<VPReplicateRecipe>(PtrR)->getOpcode() == + Instruction::GetElementPtr) || + isa<VPWidenGEPRecipe>(PtrR) || + match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue())))) + return false; + + // We are looking for a GEP where all indices are either loop invariant or + // inductions. + for (VPValue *Opd : drop_begin(PtrR->operands())) { + if (!Opd->isDefinedOutsideLoopRegions() && + !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd)) + return false; + } + + return true; +} + +/// Returns true if \p V is used as part of the address of another load or +/// store. +static bool isUsedByLoadStoreAddress(const VPUser *V) { + SmallPtrSet<const VPUser *, 4> Seen; + SmallVector<const VPUser *> WorkList = {V}; + + while (!WorkList.empty()) { + auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val()); + if (!Cur || !Seen.insert(Cur).second) + continue; + + for (VPUser *U : Cur->users()) { + if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U)) + if (InterleaveR->getAddr() == Cur) + return true; + if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) { + if (RepR->getOpcode() == Instruction::Load && + RepR->getOperand(0) == Cur) + return true; + if (RepR->getOpcode() == Instruction::Store && + RepR->getOperand(1) == Cur) + return true; + } + if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) { + if (MemR->getAddr() == Cur && MemR->isConsecutive()) + return true; + } + } + + append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users()); + } + return false; +} + InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { Instruction *UI = cast<Instruction>(getUnderlyingValue()); @@ -3218,21 +3273,60 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, } case Instruction::Load: case Instruction::Store: { - if (isSingleScalar()) { - bool IsLoad = UI->getOpcode() == Instruction::Load; - Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); - Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1)); - const Align Alignment = getLoadStoreAlignment(UI); - unsigned AS = getLoadStoreAddressSpace(UI); - TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); - InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( - UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI); - return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( - ScalarPtrTy, nullptr, nullptr, Ctx.CostKind); - } + if (VF.isScalable() && !isSingleScalar()) + return InstructionCost::getInvalid(); + // TODO: See getMemInstScalarizationCost for how to handle replicating and // predicated cases. - break; + const VPRegionBlock *ParentRegion = getParent()->getParent(); + if (ParentRegion && ParentRegion->isReplicator()) + break; + + bool IsLoad = UI->getOpcode() == Instruction::Load; + const VPValue *PtrOp = getOperand(!IsLoad); + // TODO: Handle cases where we need to pass a SCEV to + // getAddressComputationCost. + if (shouldUseAddressAccessSCEV(PtrOp)) + break; + + Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); + Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp); + const Align Alignment = getLoadStoreAlignment(UI); + unsigned AS = getLoadStoreAddressSpace(UI); + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); + InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( + UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo); + + Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF); + bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing(); + bool UsedByLoadStoreAddress = + !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this); + InstructionCost ScalarCost = + ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( + PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE, + nullptr, Ctx.CostKind); + if (isSingleScalar()) + return ScalarCost; + + SmallVector<const VPValue *> OpsToScalarize; + Type *ResultTy = Type::getVoidTy(PtrTy->getContext()); + // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we + // don't assign scalarization overhead in general, if the target prefers + // vectorized addressing or the loaded value is used as part of an address + // of another load or store. + if (!UsedByLoadStoreAddress) { + bool EfficientVectorLoadStore = + Ctx.TTI.supportsEfficientVectorElementLoadStore(); + if (!(IsLoad && !PreferVectorizedAddressing) && + !(!IsLoad && EfficientVectorLoadStore)) + append_range(OpsToScalarize, operands()); + + if (!EfficientVectorLoadStore) + ResultTy = Ctx.Types.inferScalarType(this); + } + + return (ScalarCost * VF.getFixedValue()) + + Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true); } } |