Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--  llvm/lib/Transforms/IPO/GlobalOpt.cpp                     |  16
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstructionCombining.cpp  |  31
-rw-r--r--  llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp   |  14
-rw-r--r--  llvm/lib/Transforms/Utils/Local.cpp                       |   6
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp           |  40
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp           |   5
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp                   |   7
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h                     |  17
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanHelpers.h              |  16
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp            | 149
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp         |  23
-rw-r--r--  llvm/lib/Transforms/Vectorize/VectorCombine.cpp           |  10
12 files changed, 182 insertions(+), 152 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index f88d51f..99c4982 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -1680,7 +1680,9 @@ processGlobal(GlobalValue &GV,
 /// FastCC.
 static void ChangeCalleesToFastCall(Function *F) {
   for (User *U : F->users())
-    cast<CallBase>(U)->setCallingConv(CallingConv::Fast);
+    if (auto *Call = dyn_cast<CallBase>(U))
+      if (Call->getCalledOperand() == F)
+        Call->setCallingConv(CallingConv::Fast);
 }
 
 static AttributeList StripAttr(LLVMContext &C, AttributeList Attrs,
@@ -1766,10 +1768,12 @@ isValidCandidateForColdCC(Function &F,
     return false;
 
   for (User *U : F.users()) {
-    CallBase &CB = cast<CallBase>(*U);
-    Function *CallerFunc = CB.getParent()->getParent();
+    CallBase *CB = dyn_cast<CallBase>(U);
+    if (!CB || CB->getCalledOperand() != &F)
+      continue;
+    Function *CallerFunc = CB->getParent()->getParent();
     BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc);
-    if (!isColdCallSite(CB, CallerBFI))
+    if (!isColdCallSite(*CB, CallerBFI))
       return false;
     if (!llvm::is_contained(AllCallsCold, CallerFunc))
       return false;
@@ -1779,7 +1783,9 @@ isValidCandidateForColdCC(Function &F,
 
 static void changeCallSitesToColdCC(Function *F) {
   for (User *U : F->users())
-    cast<CallBase>(U)->setCallingConv(CallingConv::Cold);
+    if (auto *Call = dyn_cast<CallBase>(U))
+      if (Call->getCalledOperand() == F)
+        Call->setCallingConv(CallingConv::Cold);
 }
 
 // This function iterates over all the call instructions in the input Function
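The guard matters because not every user of F is a direct call of F: the function may also appear as a call argument, stored into memory, or be the called operand of some other call. A minimal standalone sketch (hypothetical module and names, not part of the patch) building IR where a CallBase user of @callee does not actually call it:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      Module M("m", Ctx);
      Type *VoidTy = Type::getVoidTy(Ctx);
      PointerType *PtrTy = PointerType::get(Ctx, 0);

      Function *Callee = Function::Create(FunctionType::get(VoidTy, false),
                                          Function::InternalLinkage, "callee", &M);
      Function *Taker = Function::Create(FunctionType::get(VoidTy, {PtrTy}, false),
                                         Function::ExternalLinkage, "taker", &M);
      Function *Caller = Function::Create(FunctionType::get(VoidTy, false),
                                          Function::ExternalLinkage, "caller", &M);

      IRBuilder<> B(BasicBlock::Create(Ctx, "entry", Caller));
      B.CreateCall(Callee);          // direct call: called operand is Callee
      B.CreateCall(Taker, {Callee}); // Callee only escapes here as an argument
      B.CreateRetVoid();

      // The second call is a CallBase user of Callee whose called operand is
      // Taker; retagging its calling convention would be wrong, so the guarded
      // walk skips it and only updates the direct call.
      for (User *U : Callee->users())
        if (auto *CB = dyn_cast<CallBase>(U))
          if (CB->getCalledOperand() == Callee)
            CB->setCallingConv(CallingConv::Fast);
      return 0;
    }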
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 8fbaf68..ff063f9 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -5169,6 +5169,7 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
   //   - or: pick -1
   //   - select's condition: if the true value is constant, choose it by making
   //     the condition true.
+  //   - phi: pick the common constant across operands
   //   - default: pick 0
   //
   // Note that this transform is intentionally done here rather than
@@ -5179,9 +5180,32 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
   // TODO: This could use getBinopAbsorber() / getBinopIdentity() to avoid
   // duplicating logic for binops at least.
   auto getUndefReplacement = [&](Type *Ty) {
-    Value *BestValue = nullptr;
+    auto pickCommonConstantFromPHI = [](PHINode &PN) -> Value * {
+      // phi(freeze(undef), C, C). Choose C for freeze so the PHI can be
+      // removed.
+      Constant *BestValue = nullptr;
+      for (Value *V : PN.incoming_values()) {
+        if (match(V, m_Freeze(m_Undef())))
+          continue;
+
+        Constant *C = dyn_cast<Constant>(V);
+        if (!C)
+          return nullptr;
+
+        if (!isGuaranteedNotToBeUndefOrPoison(C))
+          return nullptr;
+
+        if (BestValue && BestValue != C)
+          return nullptr;
+
+        BestValue = C;
+      }
+      return BestValue;
+    };
+
     Value *NullValue = Constant::getNullValue(Ty);
-    for (const auto *U : I.users()) {
+    Value *BestValue = nullptr;
+    for (auto *U : I.users()) {
       Value *V = NullValue;
       if (match(U, m_Or(m_Value(), m_Value())))
         V = ConstantInt::getAllOnesValue(Ty);
@@ -5190,6 +5214,9 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
       else if (match(U, m_c_Select(m_Specific(&I), m_Value(V)))) {
         if (!isGuaranteedNotToBeUndefOrPoison(V, &AC, &I, &DT))
           V = NullValue;
+      } else if (auto *PHI = dyn_cast<PHINode>(U)) {
+        if (Value *MaybeV = pickCommonConstantFromPHI(*PHI))
+          V = MaybeV;
       }
 
       if (!BestValue)
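In IR terms the new case covers a phi such as phi [ %f, %bb0 ], [ 7, %bb1 ], [ 7, %bb2 ] where %f = freeze(undef): choosing 7 as the replacement lets the phi fold away. A minimal standalone model of the pick (plain C++, not the LLVM code; std::nullopt stands in for the freeze(undef) operand):

    #include <optional>
    #include <vector>

    // Returns the single constant shared by all non-freeze(undef) incoming
    // values, or nullopt if the values conflict (mirrors the lambda above).
    std::optional<int>
    pickCommonConstant(const std::vector<std::optional<int>> &Incoming) {
      std::optional<int> Best;
      for (const auto &V : Incoming) {
        if (!V)
          continue;              // skip the freeze(undef) operand itself
        if (Best && *Best != *V)
          return std::nullopt;   // conflicting constants: no single best choice
        Best = V;
      }
      return Best;               // e.g. {nullopt, 7, 7} -> 7
    }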
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index b988957..cf076b9a 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -5810,10 +5810,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     case Intrinsic::x86_avx512_vpdpbusds_512:
     case Intrinsic::x86_avx2_vpdpbssd_128:
     case Intrinsic::x86_avx2_vpdpbssd_256:
+    case Intrinsic::x86_avx10_vpdpbssd_512:
     case Intrinsic::x86_avx2_vpdpbssds_128:
     case Intrinsic::x86_avx2_vpdpbssds_256:
-    case Intrinsic::x86_avx10_vpdpbssd_512:
     case Intrinsic::x86_avx10_vpdpbssds_512:
+    case Intrinsic::x86_avx2_vpdpbsud_128:
+    case Intrinsic::x86_avx2_vpdpbsud_256:
+    case Intrinsic::x86_avx10_vpdpbsud_512:
+    case Intrinsic::x86_avx2_vpdpbsuds_128:
+    case Intrinsic::x86_avx2_vpdpbsuds_256:
+    case Intrinsic::x86_avx10_vpdpbsuds_512:
+    case Intrinsic::x86_avx2_vpdpbuud_128:
+    case Intrinsic::x86_avx2_vpdpbuud_256:
+    case Intrinsic::x86_avx10_vpdpbuud_512:
+    case Intrinsic::x86_avx2_vpdpbuuds_128:
+    case Intrinsic::x86_avx2_vpdpbuuds_256:
+    case Intrinsic::x86_avx10_vpdpbuuds_512:
       handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8);
       break;
 
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 123881e..21b2652 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3025,6 +3025,12 @@ static void combineMetadata(Instruction *K, const Instruction *J,
       // Preserve !nosanitize if both K and J have it.
       K->setMetadata(Kind, JMD);
       break;
+    case LLVMContext::MD_captures:
+      K->setMetadata(
+          Kind, MDNode::fromCaptureComponents(
+                    K->getContext(), MDNode::toCaptureComponents(JMD) |
+                                         MDNode::toCaptureComponents(KMD)));
+      break;
     }
   }
   // Set !invariant.group from J if J has it. If both instructions have it
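The new MD_captures arm has to be conservative: the surviving instruction K stands for both K and J, so the merged !captures metadata must allow every capture either instruction allowed, i.e. the union of the two sets. A tiny self-contained model of that rule (assumed bit values for illustration only; the real type is CaptureComponents):

    #include <cassert>
    #include <cstdint>

    enum CaptureBits : uint8_t {
      CapNone = 0,
      CapAddress = 1 << 0,
      CapReadProvenance = 1 << 1,
      CapProvenance = 1 << 2,
    };

    // Merging the metadata of two instructions keeps the union of the sets.
    inline uint8_t mergeCaptures(uint8_t K, uint8_t J) { return K | J; }

    int main() {
      assert(mergeCaptures(CapAddress, CapProvenance) ==
             (CapAddress | CapProvenance));
      assert(mergeCaptures(CapNone, CapReadProvenance) == CapReadProvenance);
      return 0;
    }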
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ab5c9c9..12fb46d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1762,9 +1762,10 @@ public:
   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                     LoopInfo *LI, TargetTransformInfo *TTI,
                     const DataLayout &DL, TTI::TargetCostKind CostKind)
-      : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
-        MemCheckExp(*PSE.getSE(), DL, "scev.check"), PSE(PSE),
-        CostKind(CostKind) {}
+      : DT(DT), LI(LI), TTI(TTI),
+        SCEVExp(*PSE.getSE(), DL, "scev.check", /*PreserveLCSSA=*/false),
+        MemCheckExp(*PSE.getSE(), DL, "scev.check", /*PreserveLCSSA=*/false),
+        PSE(PSE), CostKind(CostKind) {}
 
   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
   /// accurately estimate the cost of the runtime checks. The blocks are
@@ -3902,8 +3903,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
     if (VF.isScalar())
       continue;
 
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
-                          *CM.PSE.getSE());
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
     precomputeCosts(*Plan, VF, CostCtx);
     auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4160,8 +4160,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
 
     // Add on other costs that are modelled in VPlan, but not in the legacy
    // cost model.
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
-                          *CM.PSE.getSE());
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
     VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
     assert(VectorRegion && "Expected to have a vector region!");
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6836,7 +6835,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
 
 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                                ElementCount VF) const {
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
 
   // Now compute and add the VPlan-based cost.
@@ -7069,8 +7068,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // simplifications not accounted for in the legacy cost model. If that's the
   // case, don't trigger the assertion, as the extra simplifications may cause a
   // different VF to be picked by the VPlan-based cost model.
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
-                        *CM.PSE.getSE());
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   // Verify that the VPlan-based and legacy cost models agree, except for VPlans
   // with early exits and plans with additional VPlan simplifications. The
@@ -7486,12 +7484,13 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   VPSingleDefRecipe *VectorPtr;
   if (Reverse) {
     // When folding the tail, we may compute an address that we don't in the
-    // original scalar loop and it may not be inbounds. Drop Inbounds in that
-    // case.
+    // original scalar loop: drop the GEP no-wrap flags in this case.
+    // Otherwise preserve existing flags without no-unsigned-wrap, as we will
+    // emit negative indices.
     GEPNoWrapFlags Flags =
-        (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
+        CM.foldTailByMasking() || !GEP
             ? GEPNoWrapFlags::none()
-            : GEPNoWrapFlags::inBounds();
+            : GEP->getNoWrapFlags().withoutNoUnsignedWrap();
     VectorPtr = new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(),
                                              getLoadStoreType(I), /*Stride*/ -1,
                                              Flags, I->getDebugLoc());
@@ -8163,14 +8162,12 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
     VFRange SubRange = {VF, MaxVFTimes2};
     if (auto Plan = tryToBuildVPlanWithVPRecipes(
             std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
-      bool HasScalarVF = Plan->hasScalarVFOnly();
       // Now optimize the initial VPlan.
-      if (!HasScalarVF)
-        VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths,
-                                 *Plan, CM.getMinimalBitwidths());
+      VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths,
+                               *Plan, CM.getMinimalBitwidths());
       VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan);
       // TODO: try to put it close to addActiveLaneMask().
-      if (CM.foldTailWithEVL() && !HasScalarVF)
+      if (CM.foldTailWithEVL())
         VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
                                  *Plan, CM.getMaxSafeElements());
       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
@@ -8600,8 +8597,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // TODO: Enable following transform when the EVL-version of extended-reduction
   // and mulacc-reduction are implemented.
   if (!CM.foldTailWithEVL()) {
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
-                          *CM.PSE.getSE());
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
     VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
                              CostCtx, Range);
   }
@@ -10058,7 +10054,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
-                          CM.CostKind, *CM.PSE.getSE());
+                          CM.CostKind);
     if (!ForceVectorization &&
         !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
                                      LVP.getPlanFor(VF.Width), SEL,
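For the reverse (stride -1) vector end pointer, the new logic keeps the scalar GEP's no-wrap flags but always clears nuw, because the vector pointer is built with negative indices; when the tail is folded it claims nothing, since addresses outside the scalar trip count may be formed. A small sketch of that decision as a helper (hypothetical function name; the flag API calls are the same ones used in the hunk above):

    #include "llvm/IR/GEPNoWrapFlags.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Mirrors the flag choice made in tryToWidenMemory for reverse accesses.
    static GEPNoWrapFlags pickReverseVectorPtrFlags(bool FoldTailByMasking,
                                                    const GetElementPtrInst *GEP) {
      // Tail folding may compute addresses the scalar loop never did.
      if (FoldTailByMasking || !GEP)
        return GEPNoWrapFlags::none();
      // Keep the scalar GEP's flags, minus nuw (negative indices follow).
      return GEP->getNoWrapFlags().withoutNoUnsignedWrap();
    }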
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c547662..f77d587 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2105,6 +2105,7 @@ public:
     UserIgnoreList = nullptr;
     PostponedGathers.clear();
     ValueToGatherNodes.clear();
+    TreeEntryToStridedPtrInfoMap.clear();
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -8948,6 +8949,8 @@ BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                         const SmallDenseSet<Value *> &UserIgnoreLst) {
   deleteTree();
+  assert(TreeEntryToStridedPtrInfoMap.empty() &&
+         "TreeEntryToStridedPtrInfoMap is not cleared");
   UserIgnoreList = &UserIgnoreLst;
   if (!allSameType(Roots))
     return;
@@ -8956,6 +8959,8 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
 
 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
   deleteTree();
+  assert(TreeEntryToStridedPtrInfoMap.empty() &&
+         "TreeEntryToStridedPtrInfoMap is not cleared");
   if (!allSameType(Roots))
     return;
   buildTreeRec(Roots, 0, EdgeInfo());
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 728d291..81f1956 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1750,8 +1750,7 @@ VPCostContext::getOperandInfo(VPValue *V) const {
 }
 
 InstructionCost VPCostContext::getScalarizationOverhead(
-    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
-    bool AlwaysIncludeReplicatingR) {
+    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
   if (VF.isScalar())
     return 0;
 
@@ -1771,9 +1770,7 @@ InstructionCost VPCostContext::getScalarizationOverhead(
   SmallPtrSet<const VPValue *, 4> UniqueOperands;
   SmallVector<Type *> Tys;
   for (auto *Op : Operands) {
-    if (Op->isLiveIn() ||
-        (!AlwaysIncludeReplicatingR &&
-         isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
+    if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
         !UniqueOperands.insert(Op).second)
       continue;
     Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));
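The SLPVectorizer change enforces a reset invariant: any per-tree side table has to be wiped in deleteTree() so a later buildTree() never sees entries from the previous tree. A generic illustration of that pattern (plain C++, not SLP's actual data structures):

    #include <cassert>
    #include <map>

    struct TreeBuilder {
      // Stands in for TreeEntryToStridedPtrInfoMap: keyed by per-tree entries,
      // meaningless once the tree is deleted.
      std::map<int, int> PerTreeCache;

      void deleteTree() { PerTreeCache.clear(); }

      void buildTree() {
        assert(PerTreeCache.empty() && "per-tree cache not cleared");
        PerTreeCache[0] = 42; // populate for the new tree
      }
    };

    int main() {
      TreeBuilder B;
      B.buildTree();
      B.deleteTree(); // without this clear, the next buildTree() would assert
      B.buildTree();
      return 0;
    }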
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0822511..10d704d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2997,6 +2997,10 @@ class VPExpressionRecipe : public VPSingleDefRecipe {
     /// vector operands, performing a reduction.add on the result, and adding
     /// the scalar result to a chain.
     MulAccReduction,
+    /// Represent an inloop multiply-accumulate reduction, multiplying the
+    /// extended vector operands, negating the multiplication, performing a
+    /// reduction.add on the result, and adding the scalar result to a chain.
+    ExtNegatedMulAccReduction,
   };
 
   /// Type of the expression.
@@ -3020,6 +3024,19 @@ public:
                      VPWidenRecipe *Mul, VPReductionRecipe *Red)
       : VPExpressionRecipe(ExpressionTypes::ExtMulAccReduction,
                            {Ext0, Ext1, Mul, Red}) {}
+  VPExpressionRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
+                     VPWidenRecipe *Mul, VPWidenRecipe *Sub,
+                     VPReductionRecipe *Red)
+      : VPExpressionRecipe(ExpressionTypes::ExtNegatedMulAccReduction,
+                           {Ext0, Ext1, Mul, Sub, Red}) {
+    assert(Mul->getOpcode() == Instruction::Mul && "Expected a mul");
+    assert(Red->getRecurrenceKind() == RecurKind::Add &&
+           "Expected an add reduction");
+    assert(getNumOperands() >= 3 && "Expected at least three operands");
+    [[maybe_unused]] auto *SubConst = dyn_cast<ConstantInt>(getOperand(2)->getLiveInIRValue());
+    assert(SubConst && SubConst->getValue() == 0 &&
+           Sub->getOpcode() == Instruction::Sub && "Expected a negating sub");
+  }
 
   ~VPExpressionRecipe() override {
     for (auto *R : reverse(ExpressionRecipes))
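ExtNegatedMulAccReduction bundles the pattern reduce.add(sub(0, mul(ext(A), ext(B)))) into a single expression recipe. A scalar loop of roughly the shape that produces it (illustrative C++, not taken from the patch):

    #include <cstdint>

    // An add reduction whose vector operand is a negated widened multiply,
    // i.e. reduce.add(sub(0, mul(sext(a[i]), sext(b[i])))) after vectorization.
    int32_t negatedDot(const int8_t *A, const int8_t *B, int N) {
      int32_t Sum = 0;
      for (int I = 0; I < N; ++I)
        Sum += -(int32_t(A[I]) * int32_t(B[I]));
      return Sum;
    }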
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 2a8baec..fe59774 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -349,14 +349,12 @@ struct VPCostContext {
   LoopVectorizationCostModel &CM;
   SmallPtrSet<Instruction *, 8> SkipCostComputation;
   TargetTransformInfo::TargetCostKind CostKind;
-  ScalarEvolution &SE;
 
   VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
                 const VPlan &Plan, LoopVectorizationCostModel &CM,
-                TargetTransformInfo::TargetCostKind CostKind,
-                ScalarEvolution &SE)
+                TargetTransformInfo::TargetCostKind CostKind)
       : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
-        CostKind(CostKind), SE(SE) {}
+        CostKind(CostKind) {}
 
   /// Return the cost for \p UI with \p VF using the legacy cost model as
   /// fallback until computing the cost of all recipes migrates to VPlan.
@@ -376,12 +374,10 @@ struct VPCostContext {
 
   /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
   /// and \p Operands with \p VF. This is a convenience wrapper for the
-  /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
-  /// is true, always compute the cost of scalarizing replicating operands.
-  InstructionCost
-  getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
-                           ElementCount VF,
-                           bool AlwaysIncludeReplicatingR = false);
+  /// type-based getScalarizationOverhead API.
+  InstructionCost getScalarizationOverhead(Type *ResultTy,
+                                           ArrayRef<const VPValue *> Operands,
+                                           ElementCount VF);
 };
 
 /// This class can be used to assign names to VPValues. For VPValues without
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index b5e30cb..3a55710 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2839,12 +2839,17 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
 
     return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
                                           Ctx.CostKind);
-  case ExpressionTypes::ExtMulAccReduction:
+  case ExpressionTypes::ExtNegatedMulAccReduction:
+    assert(Opcode == Instruction::Add && "Unexpected opcode");
+    Opcode = Instruction::Sub;
+    LLVM_FALLTHROUGH;
+  case ExpressionTypes::ExtMulAccReduction: {
     return Ctx.TTI.getMulAccReductionCost(
         cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
             Instruction::ZExt,
         Opcode, RedTy, SrcVecTy, Ctx.CostKind);
   }
+  }
   llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
 }
 
@@ -2890,6 +2895,30 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
     O << ")";
     break;
   }
+  case ExpressionTypes::ExtNegatedMulAccReduction: {
+    getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
+    O << " + reduce."
+      << Instruction::getOpcodeName(
+             RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
+      << " (sub (0, mul";
+    auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
+    Mul->printFlags(O);
+    O << "(";
+    getOperand(0)->printAsOperand(O, SlotTracker);
+    auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
+    O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
+      << *Ext0->getResultType() << "), (";
+    getOperand(1)->printAsOperand(O, SlotTracker);
+    auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
+    O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
+      << *Ext1->getResultType() << ")";
+    if (Red->isConditional()) {
+      O << ", ";
+      Red->getCondOp()->printAsOperand(O, SlotTracker);
+    }
+    O << "))";
+    break;
+  }
   case ExpressionTypes::MulAccReduction:
   case ExpressionTypes::ExtMulAccReduction: {
     getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
@@ -3069,61 +3098,6 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }
 
-/// Returns true if \p Ptr is a pointer computation for which the legacy cost
-/// model computes a SCEV expression when computing the address cost.
-static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
-  auto *PtrR = Ptr->getDefiningRecipe();
-  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
-                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
-                      Instruction::GetElementPtr) ||
-                 isa<VPWidenGEPRecipe>(PtrR)))
-    return false;
-
-  // We are looking for a GEP where all indices are either loop invariant or
-  // inductions.
-  for (VPValue *Opd : drop_begin(PtrR->operands())) {
-    if (!Opd->isDefinedOutsideLoopRegions() &&
-        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
-      return false;
-  }
-
-  return true;
-}
-
-/// Returns true if \p V is used as part of the address of another load or
-/// store.
-static bool isUsedByLoadStoreAddress(const VPUser *V) {
-  SmallPtrSet<const VPUser *, 4> Seen;
-  SmallVector<const VPUser *> WorkList = {V};
-
-  while (!WorkList.empty()) {
-    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
-    if (!Cur || !Seen.insert(Cur).second)
-      continue;
-
-    for (VPUser *U : Cur->users()) {
-      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
-        if (InterleaveR->getAddr() == Cur)
-          return true;
-      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
-        if (RepR->getOpcode() == Instruction::Load &&
-            RepR->getOperand(0) == Cur)
-          return true;
-        if (RepR->getOpcode() == Instruction::Store &&
-            RepR->getOperand(1) == Cur)
-          return true;
-      }
-      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
-        if (MemR->getAddr() == Cur && MemR->isConsecutive())
-          return true;
-      }
-    }
-
-    append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
-  }
-  return false;
-}
-
 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3231,58 +3205,21 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   }
   case Instruction::Load:
   case Instruction::Store: {
-    if (VF.isScalable() && !isSingleScalar())
-      return InstructionCost::getInvalid();
-
+    if (isSingleScalar()) {
+      bool IsLoad = UI->getOpcode() == Instruction::Load;
+      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+      Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
+      const Align Alignment = getLoadStoreAlignment(UI);
+      unsigned AS = getLoadStoreAddressSpace(UI);
+      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+      InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+          UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
+      return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+                                   ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
+    }
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
-    const VPRegionBlock *ParentRegion = getParent()->getParent();
-    if (ParentRegion && ParentRegion->isReplicator())
-      break;
-
-    bool IsLoad = UI->getOpcode() == Instruction::Load;
-    const VPValue *PtrOp = getOperand(!IsLoad);
-    // TODO: Handle cases where we need to pass a SCEV to
-    // getAddressComputationCost.
-    if (shouldUseAddressAccessSCEV(PtrOp))
-      break;
-
-    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
-    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
-    const Align Alignment = getLoadStoreAlignment(UI);
-    unsigned AS = getLoadStoreAddressSpace(UI);
-    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
-    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
-        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
-
-    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
-
-    InstructionCost ScalarCost =
-        ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
-                              PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
-    if (isSingleScalar())
-      return ScalarCost;
-
-    SmallVector<const VPValue *> OpsToScalarize;
-    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
-    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
-    // don't assign scalarization overhead in general, if the target prefers
-    // vectorized addressing or the loaded value is used as part of an address
-    // of another load or store.
-    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
-    if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) {
-      bool EfficientVectorLoadStore =
-          Ctx.TTI.supportsEfficientVectorElementLoadStore();
-      if (!(IsLoad && !PreferVectorizedAddressing) &&
-          !(!IsLoad && EfficientVectorLoadStore))
-        append_range(OpsToScalarize, operands());
-
-      if (!EfficientVectorLoadStore)
-        ResultTy = Ctx.Types.inferScalarType(this);
-    }
-
-    return (ScalarCost * VF.getFixedValue()) +
-           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
+    break;
   }
   }
 
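On the cost side, a replicate load or store that is a single scalar (uniform across the vector iteration) now gets a direct cost: one scalar memory op plus one scalar address computation, independent of VF. An illustrative C++ loop of that shape (not taken from the patch):

    // *p is the same address for every lane, so when this loop is vectorized
    // the load becomes a "single scalar" replicate recipe: its cost is one
    // scalar load plus one scalar address computation and does not scale
    // with VF.
    void addUniform(int *Dst, const int *p, int N) {
      for (int I = 0; I < N; ++I)
        Dst[I] = *p + I;
    }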
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 5252e1f..a73b083 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2124,6 +2124,8 @@ static void licm(VPlan &Plan) {
 
 void VPlanTransforms::truncateToMinimalBitwidths(
     VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
+  if (Plan.hasScalarVFOnly())
+    return;
   // Keep track of created truncates, so they can be re-used. Note that we
   // cannot use RAUW after creating a new truncate, as this would could make
   // other uses have different types for their operands, making them invalidly
@@ -2704,6 +2706,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
 ///
 void VPlanTransforms::addExplicitVectorLength(
     VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
+  if (Plan.hasScalarVFOnly())
+    return;
   VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
   auto *CanonicalIVPHI = Plan.getCanonicalIV();
 
@@ -3543,7 +3547,15 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
   };
 
   VPValue *VecOp = Red->getVecOp();
+  VPRecipeBase *Sub = nullptr;
   VPValue *A, *B;
+  VPValue *Tmp = nullptr;
+  // Sub reductions could have a sub between the add reduction and vec op.
+  if (match(VecOp,
+            m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(Tmp)))) {
+    Sub = VecOp->getDefiningRecipe();
+    VecOp = Tmp;
+  }
   // Try to match reduce.add(mul(...)).
   if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
     auto *RecipeA =
@@ -3560,12 +3572,21 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
         IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
                                        Instruction::CastOps::ZExt,
                                    Mul, RecipeA, RecipeB, nullptr)) {
+      if (Sub)
+        return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
+                                      cast<VPWidenRecipe>(Sub), Red);
       return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
     }
     // Match reduce.add(mul).
-    if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
+    // TODO: Add an expression type for this variant with a negated mul
+    if (!Sub &&
+        IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
       return new VPExpressionRecipe(Mul, Red);
   }
+  // TODO: Add an expression type for negated versions of other expression
+  // variants.
+  if (Sub)
+    return nullptr;
   // Match reduce.add(ext(mul(ext(A), ext(B)))).
   // All extend recipes must have same opcode or A == B
   // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))).
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 32704bd..d6eb00d 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1031,6 +1031,16 @@ bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
 
   // Create the cast operation directly to ensure we get a new instruction
   Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
+  // Preserve cast instruction flags
+  if (RHSFlags.NNeg)
+    NewCast->setNonNeg();
+  if (RHSFlags.NUW)
+    NewCast->setHasNoUnsignedWrap();
+  if (RHSFlags.NSW)
+    NewCast->setHasNoSignedWrap();
+
+  NewCast->andIRFlags(LHSCast);
+
   // Insert the new instruction
   Value *Result = Builder.Insert(NewCast);
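The flag handling in foldBitOpOfCastConstant reads as an intersection: the folded cast should only claim nneg/nuw/nsw when it can be justified from both original casts, so the RHS cast's flags are copied onto the new cast and then narrowed against the LHS cast via andIRFlags. A plain standalone model of that rule (not the LLVM API):

    // Simplified stand-in for the cast flags involved; the real code works on
    // the instruction's nneg/nuw/nsw bits via Instruction::andIRFlags.
    struct CastFlags {
      bool NNeg = false, NUW = false, NSW = false;
    };

    // A flag survives on the folded cast only if both source casts carried it.
    inline CastFlags intersect(const CastFlags &L, const CastFlags &R) {
      return {L.NNeg && R.NNeg, L.NUW && R.NUW, L.NSW && R.NSW};
    }

    int main() {
      CastFlags L{/*NNeg=*/true, /*NUW=*/false, /*NSW=*/true};
      CastFlags R{/*NNeg=*/true, /*NUW=*/true, /*NSW=*/false};
      CastFlags F = intersect(L, R); // only NNeg remains set
      return (F.NNeg && !F.NUW && !F.NSW) ? 0 : 1;
    }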