//===- VPlanAnalysis.cpp - Various Analyses working on VPlan ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "VPlanAnalysis.h"
#include "VPlan.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;

#define DEBUG_TYPE "vplan"

VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan)
    : Ctx(Plan.getScalarHeader()->getIRBasicBlock()->getContext()) {
  if (auto *LoopRegion = Plan.getVectorLoopRegion()) {
    if (const auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>(
            &LoopRegion->getEntryBasicBlock()->front())) {
      CanonicalIVTy = CanIV->getScalarType();
      return;
    }
  }

  // If there's no canonical IV, retrieve the type from the trip count
  // expression.
  auto *TC = Plan.getTripCount();
  if (TC->isLiveIn()) {
    CanonicalIVTy = TC->getLiveInIRValue()->getType();
    return;
  }
  CanonicalIVTy = cast<VPExpandSCEVRecipe>(TC)->getSCEV()->getType();
}

Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPBlendRecipe *R) {
  Type *ResTy = inferScalarType(R->getIncomingValue(0));
  for (unsigned I = 1, E = R->getNumIncomingValues(); I != E; ++I) {
    VPValue *Inc = R->getIncomingValue(I);
    assert(inferScalarType(Inc) == ResTy &&
           "different types inferred for different incoming values");
    CachedTypes[Inc] = ResTy;
  }
  return ResTy;
}
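// Note on caching: once one value's type has been inferred, the helpers here
// also record the types of values that are asserted to match it (see the
// CachedTypes updates). For example, after inferring the type of a blend's
// first incoming value above, the remaining incoming values are cached
// without further recursive queries.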
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
  // Set the result type from the first operand, check if the types for all
  // other operands match and cache them.
  auto SetResultTyFromOp = [this, R]() {
    Type *ResTy = inferScalarType(R->getOperand(0));
    for (unsigned Op = 1; Op != R->getNumOperands(); ++Op) {
      VPValue *OtherV = R->getOperand(Op);
      assert(inferScalarType(OtherV) == ResTy &&
             "different types inferred for different operands");
      CachedTypes[OtherV] = ResTy;
    }
    return ResTy;
  };

  unsigned Opcode = R->getOpcode();
  if (Instruction::isBinaryOp(Opcode) || Instruction::isUnaryOp(Opcode))
    return SetResultTyFromOp();

  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::Freeze:
  case VPInstruction::ReductionStartVector:
    return inferScalarType(R->getOperand(0));
  case Instruction::Select: {
    Type *ResTy = inferScalarType(R->getOperand(1));
    VPValue *OtherV = R->getOperand(2);
    assert(inferScalarType(OtherV) == ResTy &&
           "different types inferred for different operands");
    CachedTypes[OtherV] = ResTy;
    return ResTy;
  }
  case Instruction::ICmp:
  case VPInstruction::ActiveLaneMask:
    assert(inferScalarType(R->getOperand(0)) ==
               inferScalarType(R->getOperand(1)) &&
           "different types inferred for different operands");
    return IntegerType::get(Ctx, 1);
  case VPInstruction::ComputeAnyOfResult:
    return inferScalarType(R->getOperand(1));
  case VPInstruction::ComputeFindIVResult:
  case VPInstruction::ComputeReductionResult: {
    return inferScalarType(R->getOperand(0));
  }
  case VPInstruction::ExplicitVectorLength:
    return Type::getIntNTy(Ctx, 32);
  case Instruction::PHI:
    // Infer the type of the first operand only, as other operands of header
    // phis may lead to infinite recursion.
    return inferScalarType(R->getOperand(0));
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::Not:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::AnyOf:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
    return SetResultTyFromOp();
  case VPInstruction::FirstActiveLane:
    return Type::getIntNTy(Ctx, 64);
  case VPInstruction::ExtractLastElement:
  case VPInstruction::ExtractPenultimateElement: {
    Type *BaseTy = inferScalarType(R->getOperand(0));
    if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
      return VecTy->getElementType();
    return BaseTy;
  }
  case VPInstruction::LogicalAnd:
    assert(inferScalarType(R->getOperand(0))->isIntegerTy(1) &&
           inferScalarType(R->getOperand(1))->isIntegerTy(1) &&
           "LogicalAnd operands should be bool");
    return IntegerType::get(Ctx, 1);
  case VPInstruction::Broadcast:
  case VPInstruction::PtrAdd:
    // Return the type based on the first operand.
    return inferScalarType(R->getOperand(0));
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
    return Type::getVoidTy(Ctx);
  default:
    break;
  }

  // Type inference not implemented for opcode.
  LLVM_DEBUG({
    dbgs() << "LV: Found unhandled opcode for: ";
    R->getVPSingleValue()->dump();
  });
  llvm_unreachable("Unhandled opcode!");
}

Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) {
  unsigned Opcode = R->getOpcode();
  if (Instruction::isBinaryOp(Opcode) || Instruction::isShift(Opcode) ||
      Instruction::isBitwiseLogicOp(Opcode)) {
    Type *ResTy = inferScalarType(R->getOperand(0));
    assert(ResTy == inferScalarType(R->getOperand(1)) &&
           "types for both operands must match for binary op");
    CachedTypes[R->getOperand(1)] = ResTy;
    return ResTy;
  }

  switch (Opcode) {
  case Instruction::ICmp:
  case Instruction::FCmp:
    return IntegerType::get(Ctx, 1);
  case Instruction::FNeg:
  case Instruction::Freeze:
    return inferScalarType(R->getOperand(0));
  case Instruction::ExtractValue: {
    assert(R->getNumOperands() == 2 && "expected single level extractvalue");
    auto *StructTy = cast<StructType>(inferScalarType(R->getOperand(0)));
    auto *CI = cast<ConstantInt>(R->getOperand(1)->getLiveInIRValue());
    return StructTy->getTypeAtIndex(CI->getZExtValue());
  }
  default:
    break;
  }

  // Type inference not implemented for opcode.
  LLVM_DEBUG({
    dbgs() << "LV: Found unhandled opcode for: ";
    R->getVPSingleValue()->dump();
  });
  llvm_unreachable("Unhandled opcode!");
}
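// For instance, a widened "add nsw i32 %x, %y" is reported as i32 by the
// helper above; the corresponding vector type <VF x i32> is only formed
// later, when the recipe is costed or executed.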
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
  auto &CI = *cast<CallInst>(R->getUnderlyingInstr());
  return CI.getType();
}

Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
  assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
         "Store recipes should not define any values");
  return cast<LoadInst>(&R->getIngredient())->getType();
}

Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenSelectRecipe *R) {
  Type *ResTy = inferScalarType(R->getOperand(1));
  VPValue *OtherV = R->getOperand(2);
  assert(inferScalarType(OtherV) == ResTy &&
         "different types inferred for different operands");
  CachedTypes[OtherV] = ResTy;
  return ResTy;
}

Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) {
  unsigned Opcode = R->getUnderlyingInstr()->getOpcode();

  if (Instruction::isBinaryOp(Opcode) || Instruction::isShift(Opcode) ||
      Instruction::isBitwiseLogicOp(Opcode)) {
    Type *ResTy = inferScalarType(R->getOperand(0));
    assert(ResTy == inferScalarType(R->getOperand(1)) &&
           "inferred types for operands of binary op don't match");
    CachedTypes[R->getOperand(1)] = ResTy;
    return ResTy;
  }

  if (Instruction::isCast(Opcode))
    return R->getUnderlyingInstr()->getType();

  switch (Opcode) {
  case Instruction::Call: {
    unsigned CallIdx = R->getNumOperands() - (R->isPredicated() ? 2 : 1);
    return cast<Function>(R->getOperand(CallIdx)->getLiveInIRValue())
        ->getReturnType();
  }
  case Instruction::Select: {
    Type *ResTy = inferScalarType(R->getOperand(1));
    assert(ResTy == inferScalarType(R->getOperand(2)) &&
           "inferred types for operands of select op don't match");
    CachedTypes[R->getOperand(2)] = ResTy;
    return ResTy;
  }
  case Instruction::ICmp:
  case Instruction::FCmp:
    return IntegerType::get(Ctx, 1);
  case Instruction::Alloca:
  case Instruction::ExtractValue:
    return R->getUnderlyingInstr()->getType();
  case Instruction::Freeze:
  case Instruction::FNeg:
  case Instruction::GetElementPtr:
    return inferScalarType(R->getOperand(0));
  case Instruction::Load:
    return cast<LoadInst>(R->getUnderlyingInstr())->getType();
  case Instruction::Store:
    // FIXME: VPReplicateRecipes with store opcodes still define a result
    // VPValue, so we need to handle them here. Remove the code here once this
    // is modeled accurately in VPlan.
    return Type::getVoidTy(Ctx);
  default:
    break;
  }

  // Type inference not implemented for opcode.
  LLVM_DEBUG({
    dbgs() << "LV: Found unhandled opcode for: ";
    R->getVPSingleValue()->dump();
  });
  llvm_unreachable("Unhandled opcode");
}

Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
  if (Type *CachedTy = CachedTypes.lookup(V))
    return CachedTy;

  if (V->isLiveIn()) {
    if (auto *IRValue = V->getLiveInIRValue())
      return IRValue->getType();
    // All VPValues without any underlying IR value (like the vector trip count
    // or the backedge-taken count) have the same type as the canonical IV.
    return CanonicalIVTy;
  }

  Type *ResultTy =
      TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
          .Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
                VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
                VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>(
              [this](const auto *R) {
                // Handle header phi recipes, except VPWidenIntOrFpInduction
                // which needs special handling due to it possibly being
                // truncated.
                // TODO: consider inferring/caching type of siblings, e.g.,
                // backedge value, here and in cases below.
                return inferScalarType(R->getStartValue());
              })
          .Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
              [](const auto *R) { return R->getScalarType(); })
          .Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe,
                VPWidenGEPRecipe, VPVectorPointerRecipe,
                VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
                VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
            return inferScalarType(R->getOperand(0));
          })
          // VPInstructionWithType must be handled before VPInstruction.
          .Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
                VPWidenCastRecipe>(
              [](const auto *R) { return R->getResultType(); })
          .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
                VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
              [this](const auto *R) { return inferScalarTypeForRecipe(R); })
          .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
            // TODO: Use info from interleave group.
            return V->getUnderlyingValue()->getType();
          })
          .Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
            return R->getSCEV()->getType();
          })
          .Case<VPReductionRecipe>([this](const auto *R) {
            return inferScalarType(R->getChainOp());
          })
          .Case<VPExpressionRecipe>([this](const auto *R) {
            return inferScalarType(R->getOperandOfResultType());
          });

  assert(ResultTy && "could not infer type for the given VPValue");
  CachedTypes[V] = ResultTy;
  return ResultTy;
}
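// Illustrative use of the analysis (a hypothetical caller, not part of this
// file): construct it once per plan and query it repeatedly; results are
// memoized in CachedTypes, so repeated queries are cheap.
//   VPTypeAnalysis TypeInfo(Plan);
//   Type *Ty = TypeInfo.inferScalarType(Def);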
void llvm::collectEphemeralRecipesForVPlan(
    VPlan &Plan, DenseSet<VPRecipeBase *> &EphRecipes) {
  // First, collect seed recipes which are operands of assumes.
  SmallVector<VPRecipeBase *> Worklist;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (!RepR || !match(RepR->getUnderlyingInstr(),
                          PatternMatch::m_Intrinsic<Intrinsic::assume>()))
        continue;
      Worklist.push_back(RepR);
      EphRecipes.insert(RepR);
    }
  }

  // Process operands of candidates in the worklist and add them to the set of
  // ephemeral recipes, if they don't have side-effects and are only used by
  // other ephemeral recipes.
  while (!Worklist.empty()) {
    VPRecipeBase *Cur = Worklist.pop_back_val();
    for (VPValue *Op : Cur->operands()) {
      auto *OpR = Op->getDefiningRecipe();
      if (!OpR || OpR->mayHaveSideEffects() || EphRecipes.contains(OpR))
        continue;
      if (any_of(Op->users(), [&EphRecipes](VPUser *U) {
            auto *UR = dyn_cast<VPRecipeBase>(U);
            return !UR || !EphRecipes.contains(UR);
          }))
        continue;
      EphRecipes.insert(OpR);
      Worklist.push_back(OpR);
    }
  }
}

template void DomTreeBuilder::Calculate<DominatorTreeBase<VPBlockBase, false>>(
    DominatorTreeBase<VPBlockBase, false> &DT);

bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
                                        const VPRecipeBase *B) {
  if (A == B)
    return false;

  auto LocalComesBefore = [](const VPRecipeBase *A, const VPRecipeBase *B) {
    for (auto &R : *A->getParent()) {
      if (&R == A)
        return true;
      if (&R == B)
        return false;
    }
    llvm_unreachable("recipe not found");
  };
  const VPBlockBase *ParentA = A->getParent();
  const VPBlockBase *ParentB = B->getParent();
  if (ParentA == ParentB)
    return LocalComesBefore(A, B);

#ifndef NDEBUG
  auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
    auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
    if (Region && Region->isReplicator()) {
      assert(Region->getNumSuccessors() == 1 &&
             Region->getNumPredecessors() == 1 && "Expected SESE region!");
      assert(R->getParent()->size() == 1 &&
             "A recipe in an original replicator region must be the only "
             "recipe in its block");
      return Region;
    }
    return nullptr;
  };
  assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(A)) &&
         "No replicate regions expected at this point");
  assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(B)) &&
         "No replicate regions expected at this point");
#endif
  return Base::properlyDominates(ParentA, ParentB);
}
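// A quick sketch of the query above: when A and B share a VPBasicBlock,
// properlyDominates(A, B) reduces to block-local ordering (A must appear
// strictly before B); otherwise it defers to the block-level dominator tree.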
/// Get the VF scaling factor applied to the recipe's output, if the recipe
/// has one.
static unsigned getVFScaleFactor(VPRecipeBase *R) {
  if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
    return RR->getVFScaleFactor();
  if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
    return RR->getVFScaleFactor();
  assert(
      (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
                                     VPInstruction::ReductionStartVector) &&
      "getting scaling factor of reduction-start-vector not implemented yet");
  return 1;
}

bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI) const {
  return any_of(MaxLocalUsers, [&TTI](auto &LU) {
    return LU.second > TTI.getNumberOfRegisters(LU.first);
  });
}

SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
    VPlan &Plan, ArrayRef<ElementCount> VFs, const TargetTransformInfo &TTI,
    const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // recipe that is the key.
  using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned, 16>;

  // Maps indices to recipes.
  SmallVector<VPRecipeBase *, 64> Idx2Recipe;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of recipes that are used in the loop.
  SmallPtrSet<VPRecipeBase *, 8> Ends;
  // Saves the list of values that are used in the loop but are defined
  // outside the loop (not including non-recipe values such as arguments and
  // constants).
  SmallSetVector<VPValue *, 8> LoopInvariants;
  LoopInvariants.insert(&Plan.getVectorTripCount());

  // We scan the loop in topological order and assign a number to each recipe.
  // We use RPO to ensure that defs are met before their users. We assume that
  // each recipe that has in-loop users starts an interval. We record every
  // time that an in-loop value is used, so we have a list of the first and
  // last occurrences of each recipe.
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getVectorLoopRegion());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    if (!VPBB->getParent())
      break;
    for (VPRecipeBase &R : *VPBB) {
      Idx2Recipe.push_back(&R);

      // Save the end location of each USE.
      for (VPValue *U : R.operands()) {
        auto *DefR = U->getDefiningRecipe();

        // Ignore non-recipe values such as arguments, constants, etc.
        // FIXME: Might need some motivation why these values are ignored. If
        // for example an argument is used inside the loop it will increase
        // the register pressure (so shouldn't we add it to LoopInvariants?).
        if (!DefR && (!U->getLiveInIRValue() ||
                      !isa<Instruction>(U->getLiveInIRValue())))
          continue;

        // If this recipe is outside the loop then record it and continue.
        if (!DefR) {
          LoopInvariants.insert(U);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[DefR] = Idx2Recipe.size();
        Ends.insert(DefR);
      }
    }
    if (VPBB == Plan.getVectorLoopRegion()->getExiting()) {
      // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
      // exiting block, where their increment will get materialized eventually.
      for (auto &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
        if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
          EndPoint[&R] = Idx2Recipe.size();
          Ends.insert(&R);
        }
      }
    }
  }
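  // At this point each recipe with in-loop users has a live interval from its
  // definition index to its last-use index; the rest of the function is
  // effectively a linear scan over those intervals, analogous to
  // live-interval based register pressure estimation.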
  // Saves the list of intervals that end with the index in 'key'.
  using RecipeList = SmallVector<VPRecipeBase *, 2>;
  SmallDenseMap<unsigned, RecipeList, 16> TransposeEnds;

  // Next, we transpose the EndPoints into a multi map that holds the list of
  // intervals that *end* at a specific location.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<VPRecipeBase *, 8> OpenIntervals;
  SmallVector<VPRegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());

  const auto &TTICapture = TTI;
  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
    if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
        (VF.isScalable() &&
         !TTICapture.isElementTypeLegalForScalableVector(Ty)))
      return 0;
    return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
  };

  // We scan the recipes linearly and record each time that a new interval
  // starts, by placing it in a set. If we find this value in TransposeEnds
  // then we remove it from the set. The max register usage is the maximum
  // register usage of the recipes of the set.
  for (unsigned int Idx = 0, Sz = Idx2Recipe.size(); Idx < Sz; ++Idx) {
    VPRecipeBase *R = Idx2Recipe[Idx];

    // Remove all of the recipes that end at this location.
    RecipeList &List = TransposeEnds[Idx];
    for (VPRecipeBase *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore recipes that are never used within the loop and do not have side
    // effects.
    if (!Ends.count(R) && !R->mayHaveSideEffects())
      continue;

    // Skip recipes for ignored values.
    // TODO: Should mark recipes for ephemeral values that cannot be removed
    // explicitly in VPlan.
    if (isa<VPSingleDefRecipe>(R) &&
        ValuesToIgnore.contains(
            cast<VPSingleDefRecipe>(R)->getUnderlyingValue()))
      continue;

    // For each VF find the maximum usage of registers.
    for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
      // Count the number of registers used, per register class, given all
      // open intervals.
      // Note that elements in this SmallMapVector will be default constructed
      // as 0. So we can use "RegUsage[ClassID] += n" in the code below even
      // if there is no previous entry for ClassID.
      SmallMapVector<unsigned, unsigned, 4> RegUsage;

      for (auto *R : OpenIntervals) {
        // Skip recipes that weren't present in the original loop.
        // TODO: Remove after removing the legacy
        // LoopVectorizationCostModel::calculateRegisterUsage.
        if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
                VPBranchOnMaskRecipe>(R))
          continue;

        if (VFs[J].isScalar() ||
            isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
                VPScalarIVStepsRecipe>(R) ||
            (isa<VPInstruction>(R) &&
             all_of(cast<VPSingleDefRecipe>(R)->users(),
                    [&](VPUser *U) {
                      return cast<VPRecipeBase>(U)->usesScalars(
                          R->getVPSingleValue());
                    })) ||
            (isa<VPReductionPHIRecipe>(R) &&
             (cast<VPReductionPHIRecipe>(R))->isInLoop())) {
          unsigned ClassID = TTI.getRegisterClassForType(
              false, TypeInfo.inferScalarType(R->getVPSingleValue()));
          // FIXME: The target might use more than one register for the type
          // even in the scalar case.
          RegUsage[ClassID] += 1;
        } else {
          // The output from scaled phis and scaled reductions actually has
          // fewer lanes than the VF.
          unsigned ScaleFactor = getVFScaleFactor(R);
          ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
          LLVM_DEBUG(if (VF != VFs[J]) {
            dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
                   << " for " << *R << "\n";
          });

          for (VPValue *DefV : R->definedValues()) {
            Type *ScalarTy = TypeInfo.inferScalarType(DefV);
            unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
            RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
          }
        }
      }

      for (const auto &Pair : RegUsage) {
        auto &Entry = MaxUsages[J][Pair.first];
        Entry = std::max(Entry, Pair.second);
      }
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current recipe to the list of open intervals.
    OpenIntervals.insert(R);
  }
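  // Illustrative arithmetic for GetRegUsage (numbers are hypothetical and
  // target-dependent): with VF = 4 and 128-bit vector registers, a <4 x i32>
  // value fits in one register while <4 x i64> needs two; that is what
  // TTI.getRegUsageForType reports for the widened type.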
  // We also search for values that are defined outside the loop but used
  // inside it. We need this number separately from the max-interval usage
  // number because, when we unroll, loop-invariant values do not take more
  // registers.
  VPRegisterUsage RU;
  for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
    // Note that elements in this SmallMapVector will be default constructed
    // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
    // there is no previous entry for ClassID.
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto *In : LoopInvariants) {
      // FIXME: The target might use more than one register for the type
      // even in the scalar case.
      bool IsScalar = all_of(In->users(), [&](VPUser *U) {
        return cast<VPRecipeBase>(U)->usesScalars(In);
      });

      ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
      unsigned ClassID = TTI.getRegisterClassForType(
          VF.isVector(), TypeInfo.inferScalarType(In));
      Invariant[ClassID] += GetRegUsage(TypeInfo.inferScalarType(In), VF);
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
             << " item\n";
      for (const auto &pair : MaxUsages[Idx]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[Idx];
    RUs[Idx] = RU;
  }

  return RUs;
}
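// Illustrative caller pattern (hypothetical; the real driver lives in the
// loop vectorizer's cost model): compute usage for each candidate VF and
// reject VFs whose pressure exceeds the target's register budget, e.g.
//   SmallVector<VPRegisterUsage, 8> RUs =
//       calculateRegisterUsageForPlan(Plan, VFs, TTI, ValuesToIgnore);
//   bool TooManyRegs = RUs[I].exceedsMaxNumRegs(TTI);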