Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--  llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp |  4
-rw-r--r--  llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp  |  7
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp        | 44
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp                |  6
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h                  |  5
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanHelpers.h           | 19
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp         |  2
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp      | 56
8 files changed, 96 insertions, 47 deletions
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index b5548d4..8c8d16a6 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1944,6 +1944,10 @@ void InstrLowerer::emitNameData() {
   NamesVar = new GlobalVariable(M, NamesVal->getType(), true,
                                 GlobalValue::PrivateLinkage, NamesVal,
                                 getInstrProfNamesVarName());
+  if (isGPUProfTarget(M)) {
+    NamesVar->setLinkage(GlobalValue::ExternalLinkage);
+    NamesVar->setVisibility(GlobalValue::ProtectedVisibility);
+  }
   NamesSize = CompressedNameStr.size();
   setGlobalVariableLargeSection(TT, *NamesVar);
diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
index a577f51..4a7144f 100644
--- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
+++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
@@ -78,11 +78,16 @@ DropUnnecessaryAssumesPass::run(Function &F, FunctionAnalysisManager &FAM) {
     SmallVector<OperandBundleDef> KeptBundles;
     unsigned NumBundles = Assume->getNumOperandBundles();
     for (unsigned I = 0; I != NumBundles; ++I) {
-      auto IsDead = [](OperandBundleUse Bundle) {
+      auto IsDead = [&](OperandBundleUse Bundle) {
        // "ignore" operand bundles are always dead.
        if (Bundle.getTagName() == "ignore")
          return true;
 
+        // "dereferenceable" operand bundles are only dropped if requested
+        // (e.g., after loop vectorization has run).
+        if (Bundle.getTagName() == "dereferenceable")
+          return DropDereferenceable;
+
        // Bundles without arguments do not affect any specific values.
        // Always keep them for now.
        if (Bundle.Inputs.empty())
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 45b5570..566d6ea 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1232,6 +1232,30 @@ public:
   /// Superset of instructions that return true for isScalarWithPredication.
   bool isPredicatedInst(Instruction *I) const;
 
+  /// A helper function that returns how much we should divide the cost of a
+  /// predicated block by. Typically this is the reciprocal of the block
+  /// probability, i.e. if we return X we are assuming the predicated block
+  /// will execute once for every X iterations of the loop header so the block
+  /// should only contribute 1/X of its cost to the total cost calculation,
+  /// but when optimizing for code size it will just be 1 as code size costs
+  /// don't depend on execution probabilities.
+  ///
+  /// TODO: We should use actual block probability here, if available.
+  /// Currently, we always assume predicated blocks have a 50% chance of
+  /// executing, apart from blocks that are only predicated due to tail folding.
+  inline unsigned
+  getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
+                          BasicBlock *BB) const {
+    // If a block wasn't originally predicated but was predicated due to
+    // e.g. tail folding, don't divide the cost. Tail folded loops may still be
+    // predicated in the final vector loop iteration, but for most loops that
+    // don't have low trip counts we can expect their probability to be close
+    // to zero.
+    if (!Legal->blockNeedsPredication(BB))
+      return 1;
+    return CostKind == TTI::TCK_CodeSize ? 1 : 2;
+  }
+
   /// Return the costs for our two available strategies for lowering a
   /// div/rem operation which requires speculating at least one lane.
   /// First result is for scalarization (will be invalid for scalable
@@ -2887,7 +2911,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
     // Scale the cost by the probability of executing the predicated blocks.
     // This assumes the predicated block for each vector lane is equally
     // likely.
-    ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
+    ScalarizationCost =
+        ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent());
   }
 
   InstructionCost SafeDivisorCost = 0;
@@ -5032,7 +5057,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
     }
 
     // Scale the total scalar cost by block probability.
-    ScalarCost /= getPredBlockCostDivisor(CostKind);
+    ScalarCost /= getPredBlockCostDivisor(CostKind, I->getParent());
 
     // Compute the discount. A non-negative discount means the vector version
     // of the instruction costs more, and scalarizing would be beneficial.
@@ -5082,10 +5107,11 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
     // stores and instructions that may divide by zero) will now be
     // unconditionally executed. For the scalar case, we may not always execute
     // the predicated block, if it is an if-else block. Thus, scale the block's
-    // cost by the probability of executing it. blockNeedsPredication from
-    // Legal is used so as to not include all blocks in tail folded loops.
-    if (VF.isScalar() && Legal->blockNeedsPredication(BB))
-      BlockCost /= getPredBlockCostDivisor(CostKind);
+    // cost by the probability of executing it.
+    // getPredBlockCostDivisor will return 1 for blocks that are only predicated
+    // by the header mask when folding the tail.
+    if (VF.isScalar())
+      BlockCost /= getPredBlockCostDivisor(CostKind, BB);
 
     Cost += BlockCost;
   }
@@ -5164,7 +5190,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   // conditional branches, but may not be executed for each vector lane. Scale
   // the cost by the probability of executing the predicated block.
   if (isPredicatedInst(I)) {
-    Cost /= getPredBlockCostDivisor(CostKind);
+    Cost /= getPredBlockCostDivisor(CostKind, I->getParent());
 
     // Add the cost of an i1 extract and a branch
     auto *VecI1Ty =
@@ -6732,6 +6758,10 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
          SkipCostComputation.contains(UI);
 }
 
+unsigned VPCostContext::getPredBlockCostDivisor(BasicBlock *BB) const {
+  return CM.getPredBlockCostDivisor(CostKind, BB);
+}
+
 InstructionCost LoopVectorizationPlanner::precomputeCosts(VPlan &Plan,
                                                           ElementCount VF,
                                                           VPCostContext &CostCtx) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 5e4303a..90696ff 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -99,20 +99,20 @@ VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def)
 
 VPValue::~VPValue() {
   assert(Users.empty() && "trying to delete a VPValue with remaining users");
-  if (Def)
+  if (VPDef *Def = getDefiningRecipe())
     Def->removeDefinedValue(this);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
-  if (const VPRecipeBase *R = dyn_cast_or_null<VPRecipeBase>(Def))
+  if (const VPRecipeBase *R = getDefiningRecipe())
     R->print(OS, "", SlotTracker);
   else
     printAsOperand(OS, SlotTracker);
 }
 
 void VPValue::dump() const {
-  const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this->Def);
+  const VPRecipeBase *Instr = getDefiningRecipe();
   VPSlotTracker SlotTracker(
       (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
   print(dbgs(), SlotTracker);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5851b3a..72858e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -959,6 +959,11 @@ public:
 
   /// Add metadata with kind \p Kind and \p Node.
   void addMetadata(unsigned Kind, MDNode *Node) {
+    assert(none_of(Metadata,
+                   [Kind](const std::pair<unsigned, MDNode *> &P) {
+                     return P.first == Kind;
+                   }) &&
+           "Kind must appear at most once in Metadata");
     Metadata.emplace_back(Kind, Node);
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 965426f..caabfa7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -50,21 +50,6 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                        int64_t Step);
 
-/// A helper function that returns how much we should divide the cost of a
-/// predicated block by. Typically this is the reciprocal of the block
-/// probability, i.e. if we return X we are assuming the predicated block will
-/// execute once for every X iterations of the loop header so the block should
-/// only contribute 1/X of its cost to the total cost calculation, but when
-/// optimizing for code size it will just be 1 as code size costs don't depend
-/// on execution probabilities.
-///
-/// TODO: We should use actual block probability here, if available. Currently,
-/// we always assume predicated blocks have a 50% chance of executing.
-inline unsigned
-getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind) {
-  return CostKind == TTI::TCK_CodeSize ? 1 : 2;
-}
-
 /// A range of powers-of-2 vectorization factors with fixed start and
 /// adjustable end. The range includes start and excludes end, e.g.,:
 /// [1, 16) = {1, 2, 4, 8}
@@ -367,6 +352,10 @@ struct VPCostContext {
   /// has already been pre-computed.
   bool skipCostComputation(Instruction *UI, bool IsVector) const;
 
+  /// \returns how much the cost of a predicated block should be divided by.
+  /// Forwards to LoopVectorizationCostModel::getPredBlockCostDivisor.
+  unsigned getPredBlockCostDivisor(BasicBlock *BB) const;
+
   /// Returns the OperandInfo for \p V, if it is a live-in.
   TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 80cd112..707886f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3349,7 +3349,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
     // Scale the cost by the probability of executing the predicated blocks.
     // This assumes the predicated block for each vector lane is equally
     // likely.
-    ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind);
+    ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent());
     return ScalarCost;
   }
   case Instruction::Load:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 634df51..eab6426 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1420,10 +1420,26 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
       // broadcasts.
       if (!vputils::isSingleScalar(RepOrWidenR) ||
           !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
-            return U->usesScalars(RepOrWidenR) ||
-                   match(cast<VPRecipeBase>(U),
-                         m_CombineOr(m_ExtractLastElement(m_VPValue()),
-                                     m_ExtractLastLanePerPart(m_VPValue())));
+            if (auto *Store = dyn_cast<VPWidenStoreRecipe>(U)) {
+              // VPWidenStore doesn't have users, and stores are always
+              // profitable to widen: hence, permitting single-scalar stored
+              // values is an important leaf condition. The assert must hold
+              // as we checked the RepOrWidenR operand against
+              // vputils::isSingleScalar.
+              assert(RepOrWidenR == Store->getAddr() ||
+                     vputils::isSingleScalar(Store->getStoredValue()));
+              return true;
+            }
+
+            if (auto *VPI = dyn_cast<VPInstruction>(U)) {
+              unsigned Opcode = VPI->getOpcode();
+              if (Opcode == VPInstruction::ExtractLastElement ||
+                  Opcode == VPInstruction::ExtractLastLanePerPart ||
+                  Opcode == VPInstruction::ExtractPenultimateElement)
+                return true;
+            }
+
+            return U->usesScalars(RepOrWidenR);
           }))
         continue;
@@ -1745,17 +1761,17 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
   if (match(Term, m_BranchOnCount()) ||
       match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
                       m_VPValue(), m_VPValue(), m_VPValue()))))) {
-    // Try to simplify the branch condition if TC <= VF * UF when the latch
-    // terminator is BranchOnCount or BranchOnCond where the input is
-    // Not(ActiveLaneMask).
-    const SCEV *TripCount =
-        vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
-    assert(!isa<SCEVCouldNotCompute>(TripCount) &&
+    // Try to simplify the branch condition if VectorTC <= VF * UF when the
+    // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
+    const SCEV *VectorTripCount =
+        vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), SE);
+    if (isa<SCEVCouldNotCompute>(VectorTripCount))
+      VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
+    assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
           "Trip count SCEV must be computable");
     ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
-    const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
-    if (TripCount->isZero() ||
-        !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
+    const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
+    if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
       return false;
   } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
     // For BranchOnCond, check if we can prove the condition to be true using VF
@@ -4131,13 +4147,13 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
 /// is defined at \p Idx of a load interleave group.
 static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
                           VPValue *OpV, unsigned Idx) {
-  auto *DefR = OpV->getDefiningRecipe();
-  if (!DefR)
-    return WideMember0->getOperand(OpIdx) == OpV;
-  if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
-    return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
-
-  if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
+  VPValue *Member0Op = WideMember0->getOperand(OpIdx);
+  VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
+  if (!Member0OpR)
+    return Member0Op == OpV;
+  if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
+    return !W->getMask() && Member0Op == OpV;
+  if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
    return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
   return false;
 }
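
For illustration only (not part of the commit above): a minimal standalone sketch of the predicated-block cost divisor this patch threads through the cost model. The enum, the Block struct, and its flag are hypothetical stand-ins for TTI::TargetCostKind and for Legal->blockNeedsPredication(BB); the real helper lives on LoopVectorizationCostModel.

// Sketch of the divisor logic under the stated assumptions; compiles standalone.
#include <cassert>
#include <cstdio>

enum class CostKind { Throughput, CodeSize }; // stand-in for TTI::TargetCostKind

struct Block {
  // Stand-in for LoopVectorizationLegality::blockNeedsPredication(BB): true if
  // the block was predicated in the original loop, false if it is only
  // predicated because of tail folding.
  bool PredicatedInOriginalLoop;
};

unsigned getPredBlockCostDivisor(CostKind Kind, const Block &BB) {
  // Blocks predicated only by the tail-folding header mask keep their full
  // cost; for most trip counts they execute on (almost) every iteration.
  if (!BB.PredicatedInOriginalLoop)
    return 1;
  // Code-size costs do not depend on execution probability; otherwise assume
  // a 50% chance of execution, i.e. divide the block cost by 2.
  return Kind == CostKind::CodeSize ? 1 : 2;
}

int main() {
  Block IfThenBlock{true}, TailFoldedBlock{false};
  assert(getPredBlockCostDivisor(CostKind::Throughput, IfThenBlock) == 2);
  assert(getPredBlockCostDivisor(CostKind::CodeSize, IfThenBlock) == 1);
  assert(getPredBlockCostDivisor(CostKind::Throughput, TailFoldedBlock) == 1);
  std::puts("predicated-block cost divisor sketch: OK");
}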
