diff options
Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r-- | llvm/lib/Transforms/Utils/SCCPSolver.cpp | 96 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.h | 17 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 1 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h | 6 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 15 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 53 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 7 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 20 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanUtils.h | 6 |
11 files changed, 186 insertions, 40 deletions
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index b80c3c9..4947d03 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" @@ -760,6 +761,7 @@ private: void handleCallArguments(CallBase &CB); void handleExtractOfWithOverflow(ExtractValueInst &EVI, const WithOverflowInst *WO, unsigned Idx); + bool isInstFullyOverDefined(Instruction &Inst); private: friend class InstVisitor<SCCPInstVisitor>; @@ -1374,49 +1376,66 @@ bool SCCPInstVisitor::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const { // 7. If a conditional branch has a value that is overdefined, make all // successors executable. void SCCPInstVisitor::visitPHINode(PHINode &PN) { - // If this PN returns a struct, just mark the result overdefined. - // TODO: We could do a lot better than this if code actually uses this. - if (PN.getType()->isStructTy()) - return (void)markOverdefined(&PN); - - if (getValueState(&PN).isOverdefined()) - return; // Quick exit - // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant, // and slow us down a lot. Just mark them overdefined. if (PN.getNumIncomingValues() > 64) return (void)markOverdefined(&PN); - unsigned NumActiveIncoming = 0; + if (isInstFullyOverDefined(PN)) + return; + SmallVector<unsigned> FeasibleIncomingIndices; + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { + if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) + continue; + FeasibleIncomingIndices.push_back(i); + } // Look at all of the executable operands of the PHI node. If any of them // are overdefined, the PHI becomes overdefined as well. If they are all // constant, and they agree with each other, the PHI becomes the identical // constant. If they are constant and don't agree, the PHI is a constant // range. If there are no executable operands, the PHI remains unknown. - ValueLatticeElement PhiState = getValueState(&PN); - for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { - if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) - continue; - - const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i)); - PhiState.mergeIn(IV); - NumActiveIncoming++; - if (PhiState.isOverdefined()) - break; + if (StructType *STy = dyn_cast<StructType>(PN.getType())) { + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + ValueLatticeElement PhiState = getStructValueState(&PN, i); + if (PhiState.isOverdefined()) + continue; + for (unsigned j : FeasibleIncomingIndices) { + const ValueLatticeElement &IV = + getStructValueState(PN.getIncomingValue(j), i); + PhiState.mergeIn(IV); + if (PhiState.isOverdefined()) + break; + } + ValueLatticeElement &PhiStateRef = getStructValueState(&PN, i); + mergeInValue(PhiStateRef, &PN, PhiState, + ValueLatticeElement::MergeOptions().setMaxWidenSteps( + FeasibleIncomingIndices.size() + 1)); + PhiStateRef.setNumRangeExtensions( + std::max((unsigned)FeasibleIncomingIndices.size(), + PhiStateRef.getNumRangeExtensions())); + } + } else { + ValueLatticeElement PhiState = getValueState(&PN); + for (unsigned i : FeasibleIncomingIndices) { + const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i)); + PhiState.mergeIn(IV); + if (PhiState.isOverdefined()) + break; + } + // We allow up to 1 range extension per active incoming value and one + // additional extension. Note that we manually adjust the number of range + // extensions to match the number of active incoming values. This helps to + // limit multiple extensions caused by the same incoming value, if other + // incoming values are equal. + ValueLatticeElement &PhiStateRef = ValueState[&PN]; + mergeInValue(PhiStateRef, &PN, PhiState, + ValueLatticeElement::MergeOptions().setMaxWidenSteps( + FeasibleIncomingIndices.size() + 1)); + PhiStateRef.setNumRangeExtensions( + std::max((unsigned)FeasibleIncomingIndices.size(), + PhiStateRef.getNumRangeExtensions())); } - - // We allow up to 1 range extension per active incoming value and one - // additional extension. Note that we manually adjust the number of range - // extensions to match the number of active incoming values. This helps to - // limit multiple extensions caused by the same incoming value, if other - // incoming values are equal. - ValueLatticeElement &PhiStateRef = ValueState[&PN]; - mergeInValue(PhiStateRef, &PN, PhiState, - ValueLatticeElement::MergeOptions().setMaxWidenSteps( - NumActiveIncoming + 1)); - PhiStateRef.setNumRangeExtensions( - std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions())); } void SCCPInstVisitor::visitReturnInst(ReturnInst &I) { @@ -2127,6 +2146,21 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { } } +bool SCCPInstVisitor::isInstFullyOverDefined(Instruction &Inst) { + // For structure Type, we handle each member separately. + // A structure object won't be considered as overdefined when + // there is at least one member that is not overdefined. + if (StructType *STy = dyn_cast<StructType>(Inst.getType())) { + for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i) { + if (!getStructValueState(&Inst, i).isOverdefined()) + return false; + } + return true; + } + + return getValueState(&Inst).isOverdefined(); +} + void SCCPInstVisitor::solve() { // Process the work lists until they are empty! while (!BBWorkList.empty() || !InstWorkList.empty()) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 280eb20..febdc54 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7192,7 +7192,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // TODO: Move to VPlan transform stage once the transition to the VPlan-based // cost model is complete for better cost estimates. VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF); - VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan); + VPlanTransforms::runPass(VPlanTransforms::materializePacksAndUnpacks, + BestVPlan); VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan); VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF); bool HasBranchWeights = diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 9cd52da..048a3e6 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5343,7 +5343,7 @@ private: unsigned &OpCnt = OrderedEntriesCount.try_emplace(TE, 0).first->getSecond(); EdgeInfo EI(TE, U.getOperandNo()); - if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps) + if (!getScheduleCopyableData(EI, Op)) continue; // Found copyable operand - continue. ++OpCnt; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 84d2ea6..fed04eb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1007,6 +1007,11 @@ public: /// Creates a fixed-width vector containing all operands. The number of /// operands matches the vector element count. BuildVector, + /// Extracts all lanes from its (non-scalable) vector operand. This is an + /// abstract VPInstruction whose single defined VPValue represents VF + /// scalars extracted from a vector, to be replaced by VF ExtractElement + /// VPInstructions. + Unpack, /// Compute the final result of a AnyOf reduction with select(cmp(),x,y), /// where one of (x,y) is loop invariant, and both x and y are integer type. ComputeAnyOfResult, @@ -2715,6 +2720,15 @@ public: return R && classof(R); } + static inline bool classof(const VPValue *VPV) { + const VPRecipeBase *R = VPV->getDefiningRecipe(); + return R && classof(R); + } + + static inline bool classof(const VPSingleDefRecipe *R) { + return classof(static_cast<const VPRecipeBase *>(R)); + } + /// Generate the reduction in the loop. void execute(VPTransformState &State) override; @@ -3100,6 +3114,9 @@ public: /// Returns true if this expression contains recipes that may have side /// effects. bool mayHaveSideEffects() const; + + /// Returns true if the result of this VPExpressionRecipe is a single-scalar. + bool isSingleScalar() const; }; /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 7e074c1..80a2e4b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -110,6 +110,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::AnyOf: case VPInstruction::BuildStructVector: case VPInstruction::BuildVector: + case VPInstruction::Unpack: return SetResultTyFromOp(); case VPInstruction::ExtractLane: return inferScalarType(R->getOperand(1)); diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index d8203e2..b5b98c6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -388,6 +388,12 @@ m_ExtractLastElement(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0); } +template <typename Op0_t, typename Op1_t> +inline VPInstruction_match<Instruction::ExtractElement, Op0_t, Op1_t> +m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1) { + return m_VPInstruction<Instruction::ExtractElement>(Op0, Op1); +} + template <typename Op0_t> inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t> m_ExtractLastLanePerPart(const Op0_t &Op0) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index d1e67e6b..a865b2d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -515,6 +515,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case VPInstruction::ExtractPenultimateElement: case VPInstruction::FirstActiveLane: case VPInstruction::Not: + case VPInstruction::Unpack: return 1; case Instruction::ICmp: case Instruction::FCmp: @@ -1246,6 +1247,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::StepVector: case VPInstruction::ReductionStartVector: case VPInstruction::VScale: + case VPInstruction::Unpack: return false; default: return true; @@ -1290,7 +1292,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case VPInstruction::PtrAdd: return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this); case VPInstruction::WidePtrAdd: - return Op == getOperand(0); + // WidePtrAdd supports scalar and vector base addresses. + return false; case VPInstruction::ComputeAnyOfResult: case VPInstruction::ComputeFindIVResult: return Op == getOperand(1); @@ -1417,6 +1420,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::ResumeForEpilogue: O << "resume-for-epilogue"; break; + case VPInstruction::Unpack: + O << "unpack"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -2888,6 +2894,13 @@ bool VPExpressionRecipe::mayHaveSideEffects() const { return false; } +bool VPExpressionRecipe::isSingleScalar() const { + // Cannot use vputils::isSingleScalar(), because all external operands + // of the expression will be live-ins while bundled. + return isa<VPReductionRecipe>(ExpressionRecipes.back()) && + !isa<VPPartialReductionRecipe>(ExpressionRecipes.back()); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f5f616f..fa1fdaf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1224,6 +1224,13 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } + uint64_t Idx; + if (match(&R, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) { + auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); + Def->replaceAllUsesWith(BuildVector->getOperand(Idx)); + return; + } + if (auto *Phi = dyn_cast<VPPhi>(Def)) { if (Phi->getNumOperands() == 1) Phi->replaceAllUsesWith(Phi->getOperand(0)); @@ -3780,7 +3787,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan, BTC->replaceAllUsesWith(TCMO); } -void VPlanTransforms::materializeBuildVectors(VPlan &Plan) { +void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) { if (Plan.hasScalarVFOnly()) return; @@ -3828,6 +3835,50 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) { }); } } + + // Create explicit VPInstructions to convert vectors to scalars. The current + // implementation is conservative - it may miss some cases that may or may not + // be vector values. TODO: introduce Unpacks speculatively - remove them later + // if they are known to operate on scalar values. + for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + if (isa<VPReplicateRecipe, VPInstruction, VPScalarIVStepsRecipe, + VPDerivedIVRecipe, VPCanonicalIVPHIRecipe>(&R)) + continue; + for (VPValue *Def : R.definedValues()) { + // Skip recipes that are single-scalar or only have their first lane + // used. + // TODO: The Defs skipped here may or may not be vector values. + // Introduce Unpacks, and remove them later, if they are guaranteed to + // produce scalar values. + if (vputils::isSingleScalar(Def) || vputils::onlyFirstLaneUsed(Def)) + continue; + + // At the moment, we create unpacks only for scalar users outside + // replicate regions. Recipes inside replicate regions still extract the + // required lanes implicitly. + // TODO: Remove once replicate regions are unrolled completely. + auto IsCandidateUnpackUser = [Def](VPUser *U) { + VPRegionBlock *ParentRegion = + cast<VPRecipeBase>(U)->getParent()->getParent(); + return U->usesScalars(Def) && + (!ParentRegion || !ParentRegion->isReplicator()); + }; + if (none_of(Def->users(), IsCandidateUnpackUser)) + continue; + + auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def}); + if (R.isPhi()) + Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi()); + else + Unpack->insertAfter(&R); + Def->replaceUsesWithIf(Unpack, + [&IsCandidateUnpackUser](VPUser &U, unsigned) { + return IsCandidateUnpackUser(&U); + }); + } + } + } } void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 5a8a2bb..b28559b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -325,9 +325,10 @@ struct VPlanTransforms { static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH); - /// Add explicit Build[Struct]Vector recipes that combine multiple scalar - /// values into single vectors. - static void materializeBuildVectors(VPlan &Plan); + /// Add explicit Build[Struct]Vector recipes to Pack multiple scalar values + /// into vectors and Unpack recipes to extract scalars from vectors as + /// needed. + static void materializePacksAndUnpacks(VPlan &Plan); /// Materialize VF and VFxUF to be computed explicitly using VPInstructions. static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 5aeda3e..cfd1a74 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -465,10 +465,21 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) { /// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe or /// VPInstruction) for lane \p Lane. Use \p Def2LaneDefs to look up scalar /// definitions for operands of \DefR. -static VPRecipeWithIRFlags * +static VPValue * cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, VPRecipeWithIRFlags *DefR, VPLane Lane, const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) { + VPValue *Op; + if (match(DefR, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)))) { + auto LaneDefs = Def2LaneDefs.find(Op); + if (LaneDefs != Def2LaneDefs.end()) + return LaneDefs->second[Lane.getKnownLane()]; + + VPValue *Idx = + Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); + } + // Collect the operands at Lane, creating extracts as needed. SmallVector<VPValue *> NewOps; for (VPValue *Op : DefR->operands()) { @@ -480,6 +491,10 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, continue; } if (Lane.getKind() == VPLane::Kind::ScalableLast) { + // Look through mandatory Unpack. + [[maybe_unused]] bool Matched = + match(Op, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op))); + assert(Matched && "original op must have been Unpack"); NewOps.push_back( Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op})); continue; @@ -547,7 +562,8 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { (isa<VPReplicateRecipe>(&R) && cast<VPReplicateRecipe>(&R)->isSingleScalar()) || (isa<VPInstruction>(&R) && - !cast<VPInstruction>(&R)->doesGeneratePerAllLanes())) + !cast<VPInstruction>(&R)->doesGeneratePerAllLanes() && + cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack)) continue; auto *DefR = cast<VPRecipeWithIRFlags>(&R); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 9a2497e..840a5b9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -84,6 +84,12 @@ inline bool isSingleScalar(const VPValue *VPV) { return VPI->isSingleScalar() || VPI->isVectorToScalar() || (PreservesUniformity(VPI->getOpcode()) && all_of(VPI->operands(), isSingleScalar)); + if (isa<VPPartialReductionRecipe>(VPV)) + return false; + if (isa<VPReductionRecipe>(VPV)) + return true; + if (auto *Expr = dyn_cast<VPExpressionRecipe>(VPV)) + return Expr->isSingleScalar(); // VPExpandSCEVRecipes must be placed in the entry and are alway uniform. return isa<VPExpandSCEVRecipe>(VPV); |