Diffstat (limited to 'llvm/lib/Transforms/IPO')
-rw-r--r--   llvm/lib/Transforms/IPO/AttributorAttributes.cpp         |  84
-rw-r--r--   llvm/lib/Transforms/IPO/ExpandVariadics.cpp              |   7
-rw-r--r--   llvm/lib/Transforms/IPO/GlobalOpt.cpp                    | 299
-rw-r--r--   llvm/lib/Transforms/IPO/IROutliner.cpp                   |   9
-rw-r--r--   llvm/lib/Transforms/IPO/LowerTypeTests.cpp               |  17
-rw-r--r--   llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 266
-rw-r--r--   llvm/lib/Transforms/IPO/OpenMPOpt.cpp                    |  42
-rw-r--r--   llvm/lib/Transforms/IPO/SampleProfile.cpp                |   2
-rw-r--r--   llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp         |   2
-rw-r--r--   llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp           |  66
10 files changed, 604 insertions, 190 deletions
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 5048561..074d8ed 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -3619,7 +3619,7 @@ struct AAIntraFnReachabilityFunction final return true; RQITy StackRQI(A, From, To, ExclusionSet, false); - typename RQITy::Reachable Result; + RQITy::Reachable Result; if (!NonConstThis->checkQueryCache(A, StackRQI, Result)) return NonConstThis->isReachableImpl(A, StackRQI, /*IsTemporaryRQI=*/true); @@ -5185,6 +5185,7 @@ struct AADereferenceableCallSiteReturned final // ------------------------ Align Argument Attribute ------------------------ namespace { + static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, Value &AssociatedValue, const Use *U, const Instruction *I, bool &TrackUse) { @@ -5200,6 +5201,35 @@ static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, TrackUse = true; return 0; } + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + // Is it appropriate to pull attribute in initialization? + const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + QueryingAA, IRPosition::value(*II->getOperand(1)), DepClassTy::NONE); + const auto *AlignAA = A.getAAFor<AAAlign>( + QueryingAA, IRPosition::value(*II), DepClassTy::NONE); + if (ConstVals && ConstVals->isValidState() && ConstVals->isAtFixpoint()) { + unsigned ShiftValue = std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Align ConstAlign(UINT64_C(1) << ShiftValue); + if (ConstAlign >= AlignAA->getKnownAlign()) + return Align(1).value(); + } + if (AlignAA) + return AlignAA->getKnownAlign().value(); + break; + } + case Intrinsic::amdgcn_make_buffer_rsrc: { + const auto *AlignAA = A.getAAFor<AAAlign>( + QueryingAA, IRPosition::value(*II), DepClassTy::NONE); + if (AlignAA) + return AlignAA->getKnownAlign().value(); + break; + } + default: + break; + } MaybeAlign MA; if (const auto *CB = dyn_cast<CallBase>(I)) { @@ -5499,6 +5529,56 @@ struct AAAlignCallSiteReturned final AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {} + ChangeStatus updateImpl(Attributor &A) override { + Instruction *I = getIRPosition().getCtxI(); + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + Align Alignment; + bool Valid = false; + + const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + *this, IRPosition::value(*II->getOperand(1)), DepClassTy::REQUIRED); + if (ConstVals && ConstVals->isValidState()) { + unsigned ShiftValue = + std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Alignment = Align(UINT64_C(1) << ShiftValue); + Valid = true; + } + + const auto *AlignAA = + A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))), + DepClassTy::REQUIRED); + if (AlignAA) { + Alignment = std::max(AlignAA->getAssumedAlign(), Alignment); + Valid = true; + } + + if (Valid) + return clampStateAndIndicateChange<StateType>( + this->getState(), + std::min(this->getAssumedAlign(), Alignment).value()); + break; + } + // FIXME: Should introduce target specific sub-attributes and letting + // getAAfor<AAAlign> lead to create sub-attribute to handle target + // specific intrinsics. 
+ case Intrinsic::amdgcn_make_buffer_rsrc: { + const auto *AlignAA = + A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))), + DepClassTy::REQUIRED); + if (AlignAA) + return clampStateAndIndicateChange<StateType>( + this->getState(), AlignAA->getAssumedAlign().value()); + break; + } + default: + break; + } + } + return Base::updateImpl(A); + }; /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); } }; @@ -10701,7 +10781,7 @@ struct AAInterFnReachabilityFunction auto *NonConstThis = const_cast<AAInterFnReachabilityFunction *>(this); RQITy StackRQI(A, From, To, ExclusionSet, false); - typename RQITy::Reachable Result; + RQITy::Reachable Result; if (!NonConstThis->checkQueryCache(A, StackRQI, Result)) return NonConstThis->isReachableImpl(A, StackRQI, /*IsTemporaryRQI=*/true); diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp index 6a11aec..4863d6b 100644 --- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp +++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp @@ -422,6 +422,13 @@ bool ExpandVariadics::runOnFunction(Module &M, IRBuilder<> &Builder, assert(VariadicWrapperDefine == VariadicWrapper); assert(!VariadicWrapper->isDeclaration()); + // Add the prof metadata from the original function to the wrapper. Because + // FixedArityReplacement is the owner of original function's prof metadata + // after the splice, we need to transfer it to VariadicWrapper. + VariadicWrapper->setMetadata( + LLVMContext::MD_prof, + FixedArityReplacement->getMetadata(LLVMContext::MD_prof)); + // We now have: // 1. the original function, now as a declaration with no uses // 2. a variadic function that unconditionally calls a fixed arity replacement diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 99c4982..66b95fc 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -99,6 +99,11 @@ static cl::opt<bool> "functions from non-versioned callers."), cl::init(true), cl::Hidden); +static cl::opt<unsigned> MaxIFuncVersions( + "max-ifunc-versions", cl::Hidden, cl::init(5), + cl::desc("Maximum number of caller/callee versions that is allowed for " + "using the expensive (cubic) static resolution algorithm.")); + static cl::opt<bool> EnableColdCCStressTest("enable-coldcc-stress-test", cl::desc("Enable stress test of coldcc by adding " @@ -2018,12 +2023,15 @@ OptimizeFunctions(Module &M, if (hasChangeableCC(&F, ChangeableCCCache)) { // If this function has a calling convention worth changing, is not a - // varargs function, and is only called directly, promote it to use the - // Fast calling convention. - F.setCallingConv(CallingConv::Fast); - ChangeCalleesToFastCall(&F); - ++NumFastCallFns; - Changed = true; + // varargs function, is only called directly, and is supported by the + // target, promote it to use the Fast calling convention. + TargetTransformInfo &TTI = GetTTI(F); + if (TTI.useFastCCForInternalCall(F)) { + F.setCallingConv(CallingConv::Fast); + ChangeCalleesToFastCall(&F); + ++NumFastCallFns; + Changed = true; + } } if (F.getAttributes().hasAttrSomewhere(Attribute::Nest) && @@ -2488,20 +2496,21 @@ DeleteDeadIFuncs(Module &M, // Follows the use-def chain of \p V backwards until it finds a Function, // in which case it collects in \p Versions. Return true on successful // use-def chain traversal, false otherwise. 
-static bool collectVersions(TargetTransformInfo &TTI, Value *V, - SmallVectorImpl<Function *> &Versions) { +static bool +collectVersions(Value *V, SmallVectorImpl<Function *> &Versions, + function_ref<TargetTransformInfo &(Function &)> GetTTI) { if (auto *F = dyn_cast<Function>(V)) { - if (!TTI.isMultiversionedFunction(*F)) + if (!GetTTI(*F).isMultiversionedFunction(*F)) return false; Versions.push_back(F); } else if (auto *Sel = dyn_cast<SelectInst>(V)) { - if (!collectVersions(TTI, Sel->getTrueValue(), Versions)) + if (!collectVersions(Sel->getTrueValue(), Versions, GetTTI)) return false; - if (!collectVersions(TTI, Sel->getFalseValue(), Versions)) + if (!collectVersions(Sel->getFalseValue(), Versions, GetTTI)) return false; } else if (auto *Phi = dyn_cast<PHINode>(V)) { for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) - if (!collectVersions(TTI, Phi->getIncomingValue(I), Versions)) + if (!collectVersions(Phi->getIncomingValue(I), Versions, GetTTI)) return false; } else { // Unknown instruction type. Bail. @@ -2510,31 +2519,45 @@ static bool collectVersions(TargetTransformInfo &TTI, Value *V, return true; } -// Bypass the IFunc Resolver of MultiVersioned functions when possible. To -// deduce whether the optimization is legal we need to compare the target -// features between caller and callee versions. The criteria for bypassing -// the resolver are the following: -// -// * If the callee's feature set is a subset of the caller's feature set, -// then the callee is a candidate for direct call. -// -// * Among such candidates the one of highest priority is the best match -// and it shall be picked, unless there is a version of the callee with -// higher priority than the best match which cannot be picked from a -// higher priority caller (directly or through the resolver). -// -// * For every higher priority callee version than the best match, there -// is a higher priority caller version whose feature set availability -// is implied by the callee's feature set. +// Try to statically resolve calls to versioned functions when possible. First +// we identify the function versions which are associated with an IFUNC symbol. +// We do that by examining the resolver function of the IFUNC. Once we have +// collected all the function versions, we sort them in decreasing priority +// order. This is necessary for determining the most suitable callee version +// for each caller version. We then collect all the callsites to versioned +// functions. The static resolution is performed by comparing the feature sets +// between callers and callees. Specifically: +// * Start a walk over caller and callee lists simultaneously in order of +// decreasing priority. +// * Statically resolve calls from the current caller to the current callee, +// iff the caller feature bits are a superset of the callee feature bits. +// * For FMV callers, as long as the caller feature bits are a subset of the +// callee feature bits, advance to the next callee. This effectively prevents +// considering the current callee as a candidate for static resolution by +// following callers (explanation: preceding callers would not have been +// selected in a hypothetical runtime execution). +// * Advance to the next caller. 
// +// Presentation in EuroLLVM2025: +// https://www.youtube.com/watch?v=k54MFimPz-A&t=867s static bool OptimizeNonTrivialIFuncs( Module &M, function_ref<TargetTransformInfo &(Function &)> GetTTI) { bool Changed = false; - // Cache containing the mask constructed from a function's target features. + // Map containing the feature bits for a given function. DenseMap<Function *, APInt> FeatureMask; + // Map containing the priority bits for a given function. + DenseMap<Function *, APInt> PriorityMask; + // Map containing all the function versions corresponding to an IFunc symbol. + DenseMap<GlobalIFunc *, SmallVector<Function *>> VersionedFuncs; + // Map containing the IFunc symbol a function is version of. + DenseMap<Function *, GlobalIFunc *> VersionOf; + // List of all the interesting IFuncs found in the module. + SmallVector<GlobalIFunc *> IFuncs; for (GlobalIFunc &IF : M.ifuncs()) { + LLVM_DEBUG(dbgs() << "Examining IFUNC " << IF.getName() << "\n"); + if (IF.isInterposable()) continue; @@ -2545,107 +2568,175 @@ static bool OptimizeNonTrivialIFuncs( if (Resolver->isInterposable()) continue; - TargetTransformInfo &TTI = GetTTI(*Resolver); - - // Discover the callee versions. - SmallVector<Function *> Callees; - if (any_of(*Resolver, [&TTI, &Callees](BasicBlock &BB) { + SmallVector<Function *> Versions; + // Discover the versioned functions. + if (any_of(*Resolver, [&](BasicBlock &BB) { if (auto *Ret = dyn_cast_or_null<ReturnInst>(BB.getTerminator())) - if (!collectVersions(TTI, Ret->getReturnValue(), Callees)) + if (!collectVersions(Ret->getReturnValue(), Versions, GetTTI)) return true; return false; })) continue; - if (Callees.empty()) + if (Versions.empty()) continue; - LLVM_DEBUG(dbgs() << "Statically resolving calls to function " - << Resolver->getName() << "\n"); - - // Cache the feature mask for each callee. - for (Function *Callee : Callees) { - auto [It, Inserted] = FeatureMask.try_emplace(Callee); - if (Inserted) - It->second = TTI.getFeatureMask(*Callee); + for (Function *V : Versions) { + VersionOf.insert({V, &IF}); + auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(V); + if (FeatInserted) + FeatIt->second = GetTTI(*V).getFeatureMask(*V); + auto [PriorIt, PriorInserted] = PriorityMask.try_emplace(V); + if (PriorInserted) + PriorIt->second = GetTTI(*V).getPriorityMask(*V); } - // Sort the callee versions in decreasing priority order. - sort(Callees, [&](auto *LHS, auto *RHS) { - return FeatureMask[LHS].ugt(FeatureMask[RHS]); + // Sort function versions in decreasing priority order. + sort(Versions, [&](auto *LHS, auto *RHS) { + return PriorityMask[LHS].ugt(PriorityMask[RHS]); }); - // Find the callsites and cache the feature mask for each caller. - SmallVector<Function *> Callers; + IFuncs.push_back(&IF); + VersionedFuncs.try_emplace(&IF, std::move(Versions)); + } + + for (GlobalIFunc *CalleeIF : IFuncs) { + SmallVector<Function *> NonFMVCallers; + DenseSet<GlobalIFunc *> CallerIFuncs; DenseMap<Function *, SmallVector<CallBase *>> CallSites; - for (User *U : IF.users()) { + + // Find the callsites. 
+ for (User *U : CalleeIF->users()) { if (auto *CB = dyn_cast<CallBase>(U)) { - if (CB->getCalledOperand() == &IF) { + if (CB->getCalledOperand() == CalleeIF) { Function *Caller = CB->getFunction(); - auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(Caller); - if (FeatInserted) - FeatIt->second = TTI.getFeatureMask(*Caller); - auto [CallIt, CallInserted] = CallSites.try_emplace(Caller); - if (CallInserted) - Callers.push_back(Caller); - CallIt->second.push_back(CB); + GlobalIFunc *CallerIF = nullptr; + TargetTransformInfo &TTI = GetTTI(*Caller); + bool CallerIsFMV = TTI.isMultiversionedFunction(*Caller); + // The caller is a version of a known IFunc. + if (auto It = VersionOf.find(Caller); It != VersionOf.end()) + CallerIF = It->second; + else if (!CallerIsFMV && OptimizeNonFMVCallers) { + // The caller is non-FMV. + auto [It, Inserted] = FeatureMask.try_emplace(Caller); + if (Inserted) + It->second = TTI.getFeatureMask(*Caller); + } else + // The caller is none of the above, skip. + continue; + auto [It, Inserted] = CallSites.try_emplace(Caller); + if (Inserted) { + if (CallerIsFMV) + CallerIFuncs.insert(CallerIF); + else + NonFMVCallers.push_back(Caller); + } + It->second.push_back(CB); } } } - // Sort the caller versions in decreasing priority order. - sort(Callers, [&](auto *LHS, auto *RHS) { - return FeatureMask[LHS].ugt(FeatureMask[RHS]); - }); + if (CallSites.empty()) + continue; - auto implies = [](APInt A, APInt B) { return B.isSubsetOf(A); }; - - // Index to the highest priority candidate. - unsigned I = 0; - // Now try to redirect calls starting from higher priority callers. - for (Function *Caller : Callers) { - assert(I < Callees.size() && "Found callers of equal priority"); - - Function *Callee = Callees[I]; - APInt CallerBits = FeatureMask[Caller]; - APInt CalleeBits = FeatureMask[Callee]; - - // In the case of FMV callers, we know that all higher priority callers - // than the current one did not get selected at runtime, which helps - // reason about the callees (if they have versions that mandate presence - // of the features which we already know are unavailable on this target). - if (TTI.isMultiversionedFunction(*Caller)) { - // If the feature set of the caller implies the feature set of the - // highest priority candidate then it shall be picked. In case of - // identical sets advance the candidate index one position. - if (CallerBits == CalleeBits) - ++I; - else if (!implies(CallerBits, CalleeBits)) { - // Keep advancing the candidate index as long as the caller's - // features are a subset of the current candidate's. - while (implies(CalleeBits, CallerBits)) { - if (++I == Callees.size()) - break; - CalleeBits = FeatureMask[Callees[I]]; + LLVM_DEBUG(dbgs() << "Statically resolving calls to function " + << CalleeIF->getResolverFunction()->getName() << "\n"); + + // The complexity of this algorithm is linear: O(NumCallers + NumCallees) + // if NumCallers > MaxIFuncVersions || NumCallees > MaxIFuncVersions, + // otherwise it is cubic: O((NumCallers ^ 2) x NumCallees). + auto staticallyResolveCalls = [&](ArrayRef<Function *> Callers, + ArrayRef<Function *> Callees, + bool CallerIsFMV) { + bool AllowExpensiveChecks = CallerIsFMV && + Callers.size() <= MaxIFuncVersions && + Callees.size() <= MaxIFuncVersions; + // Index to the highest callee candidate. + unsigned J = 0; + + for (unsigned I = 0, E = Callers.size(); I < E; ++I) { + // There are no callee candidates left. 
+ if (J == Callees.size()) + break; + + Function *Caller = Callers[I]; + APInt CallerBits = FeatureMask[Caller]; + + // Compare the feature bits of the best callee candidate with all the + // caller versions preceeding the current one. For each prior caller + // discard feature bits that are known to be available in the current + // caller. As long as the known missing feature bits are a subset of the + // callee feature bits, advance to the next callee and start over. + auto eliminateAvailableFeatures = [&](unsigned BestCandidate) { + unsigned K = 0; + while (K < I && BestCandidate < Callees.size()) { + APInt MissingBits = FeatureMask[Callers[K]] & ~CallerBits; + if (MissingBits.isSubsetOf(FeatureMask[Callees[BestCandidate]])) { + ++BestCandidate; + // Start over. + K = 0; + } else + ++K; } + return BestCandidate; + }; + + unsigned BestCandidate = + AllowExpensiveChecks ? eliminateAvailableFeatures(J) : J; + // No callee candidate was found for this caller. + if (BestCandidate == Callees.size()) continue; + + LLVM_DEBUG(dbgs() << " Examining " + << (CallerIsFMV ? "FMV" : "regular") << " caller " + << Caller->getName() << "\n"); + + Function *Callee = Callees[BestCandidate]; + APInt CalleeBits = FeatureMask[Callee]; + + // Statically resolve calls from the current caller to the current + // callee, iff the caller feature bits are a superset of the callee + // feature bits. + if (CalleeBits.isSubsetOf(CallerBits)) { + // Not all caller versions are necessarily users of the callee IFUNC. + if (auto It = CallSites.find(Caller); It != CallSites.end()) { + for (CallBase *CS : It->second) { + LLVM_DEBUG(dbgs() << " Redirecting call " << Caller->getName() + << " -> " << Callee->getName() << "\n"); + CS->setCalledOperand(Callee); + } + Changed = true; + } } - } else { - // We can't reason much about non-FMV callers. Just pick the highest - // priority callee if it matches, otherwise bail. - if (!OptimizeNonFMVCallers || I > 0 || !implies(CallerBits, CalleeBits)) + + // Nothing else to do about non-FMV callers. + if (!CallerIsFMV) continue; + + // For FMV callers, as long as the caller feature bits are a subset of + // the callee feature bits, advance to the next callee. This effectively + // prevents considering the current callee as a candidate for static + // resolution by following callers. + while (CallerBits.isSubsetOf(FeatureMask[Callees[J]]) && + ++J < Callees.size()) + ; } - auto &Calls = CallSites[Caller]; - for (CallBase *CS : Calls) { - LLVM_DEBUG(dbgs() << "Redirecting call " << Caller->getName() << " -> " - << Callee->getName() << "\n"); - CS->setCalledOperand(Callee); - } - Changed = true; + }; + + auto &Callees = VersionedFuncs[CalleeIF]; + + // Optimize non-FMV calls. + if (OptimizeNonFMVCallers) + staticallyResolveCalls(NonFMVCallers, Callees, /*CallerIsFMV=*/false); + + // Optimize FMV calls. 
+ for (GlobalIFunc *CallerIF : CallerIFuncs) { + auto &Callers = VersionedFuncs[CallerIF]; + staticallyResolveCalls(Callers, Callees, /*CallerIsFMV=*/true); } - if (IF.use_empty() || - all_of(IF.users(), [](User *U) { return isa<GlobalAlias>(U); })) + + if (CalleeIF->use_empty() || + all_of(CalleeIF->users(), [](User *U) { return isa<GlobalAlias>(U); })) NumIFuncsResolved++; } return Changed; diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index fdf0c3a..6e1ca9c 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -2108,8 +2108,7 @@ static void createAndInsertBasicBlocks(DenseMap<Value *, BasicBlock *> &OldMap, for (Value *RetVal : SortedKeys) { BasicBlock *NewBB = BasicBlock::Create( - ParentFunc->getContext(), - Twine(BaseName) + Twine("_") + Twine(static_cast<unsigned>(Idx++)), + ParentFunc->getContext(), Twine(BaseName) + Twine("_") + Twine(Idx++), ParentFunc); NewMap.insert(std::make_pair(RetVal, NewBB)); } @@ -2286,9 +2285,9 @@ void IROutliner::deduplicateExtractedSections( // Create a set of BasicBlocks, one for each return block, to hold the // needed store instructions. DenseMap<Value *, BasicBlock *> NewBBs; - createAndInsertBasicBlocks( - CurrentGroup.EndBBs, NewBBs, CurrentGroup.OutlinedFunction, - "output_block_" + Twine(static_cast<unsigned>(Idx))); + createAndInsertBasicBlocks(CurrentGroup.EndBBs, NewBBs, + CurrentGroup.OutlinedFunction, + "output_block_" + Twine(Idx)); replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings); alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBBs, CurrentGroup.EndBBs, OutputMappings, diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index aa1346d..06deea8 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -48,12 +48,14 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/IR/ModuleSummaryIndexYAML.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/ReplaceConstant.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" @@ -78,7 +80,6 @@ #include <algorithm> #include <cassert> #include <cstdint> -#include <memory> #include <set> #include <string> #include <system_error> @@ -803,7 +804,9 @@ Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI, return createBitSetTest(ThenB, TIL, BitOffset); } - IRBuilder<> ThenB(SplitBlockAndInsertIfThen(OffsetInRange, CI, false)); + MDBuilder MDB(M.getContext()); + IRBuilder<> ThenB(SplitBlockAndInsertIfThen(OffsetInRange, CI, false, + MDB.createLikelyBranchWeights())); // Now that we know that the offset is in range and aligned, load the // appropriate bit from the bitset. @@ -1470,6 +1473,9 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( Constant::getNullValue(F->getType())); Value *Select = Builder.CreateSelect(ICmp, JT, Constant::getNullValue(F->getType())); + + if (auto *SI = dyn_cast<SelectInst>(Select)) + setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE); // For phi nodes, we need to update the incoming value for all operands // with the same predecessor. if (PN) @@ -1552,12 +1558,7 @@ void LowerTypeTestsModule::createJumpTable( // Align the whole table by entry size. 
F->setAlignment(Align(getJumpTableEntrySize(JumpTableArch))); - // Skip prologue. - // Disabled on win32 due to https://llvm.org/bugs/show_bug.cgi?id=28641#c3. - // Luckily, this function does not get any prologue even without the - // attribute. - if (OS != Triple::Win32) - F->addFnAttr(Attribute::Naked); + F->addFnAttr(Attribute::Naked); if (JumpTableArch == Triple::arm) F->addFnAttr("target-features", "-thumb-mode"); if (JumpTableArch == Triple::thumb) { diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 894d83f..0f4bc64 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -107,6 +107,10 @@ STATISTIC(MismatchedCloneAssignments, STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes"); STATISTIC(TotalMergeIters, "Number of merge iterations for nodes"); STATISTIC(MaxMergeIters, "Max merge iterations for nodes"); +STATISTIC(NumImportantContextIds, "Number of important context ids"); +STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted"); +STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added"); +STATISTIC(NumFixedContexts, "Number of contexts with fixed edges"); static cl::opt<std::string> DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, @@ -223,9 +227,18 @@ static cl::opt<bool> MemProfRequireDefinitionForPromotion( extern cl::opt<bool> MemProfReportHintedSizes; extern cl::opt<unsigned> MinClonedColdBytePercent; +cl::opt<unsigned> MemProfTopNImportant( + "memprof-top-n-important", cl::init(10), cl::Hidden, + cl::desc("Number of largest cold contexts to consider important")); + +cl::opt<bool> MemProfFixupImportant( + "memprof-fixup-important", cl::init(true), cl::Hidden, + cl::desc("Enables edge fixup for important contexts")); + } // namespace llvm namespace { + /// CRTP base for graphs built from either IR or ThinLTO summary index. /// /// The graph represents the call contexts in all memprof metadata on allocation @@ -581,17 +594,26 @@ protected: /// Adds nodes for the given MIB stack ids. template <class NodeT, class IteratorT> - void addStackNodesForMIB(ContextNode *AllocNode, - CallStack<NodeT, IteratorT> &StackContext, - CallStack<NodeT, IteratorT> &CallsiteContext, - AllocationType AllocType, - ArrayRef<ContextTotalSize> ContextSizeInfo); + void addStackNodesForMIB( + ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext, + CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType, + ArrayRef<ContextTotalSize> ContextSizeInfo, + std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold); /// Matches all callsite metadata (or summary) to the nodes created for /// allocation memprof MIB metadata, synthesizing new nodes to reflect any /// inlining performed on those callsite instructions. void updateStackNodes(); + /// Optionally fixup edges for the N largest cold contexts to better enable + /// cloning. This is particularly helpful if the context includes recursion + /// as well as inlining, resulting in a single stack node for multiple stack + /// ids in the context. With recursion it is particularly difficult to get the + /// edge updates correct as in the general case we have lost the original + /// stack id ordering for the context. Do more expensive fixup for the largest + /// contexts, controlled by MemProfTopNImportant and MemProfFixupImportant. 
+ void fixupImportantContexts(); + /// Update graph to conservatively handle any callsite stack nodes that target /// multiple different callee target functions. void handleCallsitesWithMultipleTargets(); @@ -658,7 +680,8 @@ private: void assignStackNodesPostOrder( ContextNode *Node, DenseSet<const ContextNode *> &Visited, DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls, - DenseMap<CallInfo, CallInfo> &CallToMatchingCall); + DenseMap<CallInfo, CallInfo> &CallToMatchingCall, + const DenseSet<uint32_t> &ImportantContextIds); /// Duplicates the given set of context ids, updating the provided /// map from each original id with the newly generated context ids, @@ -859,6 +882,50 @@ private: /// nodes. DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap; + /// Saves information for the contexts identified as important (the largest + /// cold contexts up to MemProfTopNImportant). + struct ImportantContextInfo { + // The original list of leaf first stack ids corresponding to this context. + std::vector<uint64_t> StackIds; + // Max length of stack ids corresponding to a single stack ContextNode for + // this context (i.e. the max length of a key in StackIdsToNode below). + unsigned MaxLength = 0; + // Mapping of slices of the stack ids to the corresponding ContextNode + // (there can be multiple stack ids due to inlining). Populated when + // updating stack nodes while matching them to the IR or summary. + std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode; + }; + + // Map of important full context ids to information about each. + DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo; + + // For each important context id found in Node (if any), records the list of + // stack ids that corresponded to the given callsite Node. There can be more + // than one in the case of inlining. + void recordStackNode(std::vector<uint64_t> &StackIds, ContextNode *Node, + // We pass in the Node's context ids to avoid the + // overhead of computing them as the caller already has + // them in some cases. + const DenseSet<uint32_t> &NodeContextIds, + const DenseSet<uint32_t> &ImportantContextIds) { + if (!MemProfTopNImportant) { + assert(ImportantContextIds.empty()); + return; + } + DenseSet<uint32_t> Ids = + set_intersection(NodeContextIds, ImportantContextIds); + if (Ids.empty()) + return; + auto Size = StackIds.size(); + for (auto Id : Ids) { + auto &Entry = ImportantContextIdInfo[Id]; + Entry.StackIdsToNode[StackIds] = Node; + // Keep track of the max to simplify later analysis. + if (Size > Entry.MaxLength) + Entry.MaxLength = Size; + } + } + /// Maps to track the calls to their corresponding nodes in the graph. 
MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap; MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap; @@ -1034,11 +1101,11 @@ private: } // namespace template <> -struct llvm::DenseMapInfo<typename CallsiteContextGraph< +struct llvm::DenseMapInfo<CallsiteContextGraph< ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo> : public DenseMapInfo<std::pair<Instruction *, unsigned>> {}; template <> -struct llvm::DenseMapInfo<typename CallsiteContextGraph< +struct llvm::DenseMapInfo<CallsiteContextGraph< IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo> : public DenseMapInfo<std::pair<IndexCall, unsigned>> {}; template <> @@ -1353,7 +1420,8 @@ template <class NodeT, class IteratorT> void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext, CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType, - ArrayRef<ContextTotalSize> ContextSizeInfo) { + ArrayRef<ContextTotalSize> ContextSizeInfo, + std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold) { // Treating the hot alloc type as NotCold before the disambiguation for "hot" // is done. if (AllocType == AllocationType::Hot) @@ -1361,8 +1429,33 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( ContextIdToAllocationType[++LastContextId] = AllocType; + bool IsImportant = false; if (!ContextSizeInfo.empty()) { auto &Entry = ContextIdToContextSizeInfos[LastContextId]; + // If this is a cold allocation, and we are collecting non-zero largest + // contexts, see if this is a candidate. + if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) { + uint64_t TotalCold = 0; + for (auto &CSI : ContextSizeInfo) + TotalCold += CSI.TotalSize; + // Record this context if either we haven't found the first top-n largest + // yet, or if it is larger than the smallest already recorded. + if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant || + // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly + // sorted in ascending size of its key which is the size. + TotalCold > TotalSizeToContextIdTopNCold.begin()->first) { + if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) { + // Remove old one and its associated entries. 
+ auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second; + TotalSizeToContextIdTopNCold.erase( + TotalSizeToContextIdTopNCold.begin()); + assert(ImportantContextIdInfo.count(IdToRemove)); + ImportantContextIdInfo.erase(IdToRemove); + } + TotalSizeToContextIdTopNCold[TotalCold] = LastContextId; + IsImportant = true; + } + } Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end()); } @@ -1381,6 +1474,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext); ContextIter != StackContext.end(); ++ContextIter) { auto StackId = getStackId(*ContextIter); + if (IsImportant) + ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId); ContextNode *StackNode = getNodeForStackId(StackId); if (!StackNode) { StackNode = createNewNode(/*IsAllocation=*/false); @@ -1600,11 +1695,12 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, template <typename DerivedCCG, typename FuncTy, typename CallTy> void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: - assignStackNodesPostOrder( - ContextNode *Node, DenseSet<const ContextNode *> &Visited, - DenseMap<uint64_t, std::vector<CallContextInfo>> - &StackIdToMatchingCalls, - DenseMap<CallInfo, CallInfo> &CallToMatchingCall) { + assignStackNodesPostOrder(ContextNode *Node, + DenseSet<const ContextNode *> &Visited, + DenseMap<uint64_t, std::vector<CallContextInfo>> + &StackIdToMatchingCalls, + DenseMap<CallInfo, CallInfo> &CallToMatchingCall, + const DenseSet<uint32_t> &ImportantContextIds) { auto Inserted = Visited.insert(Node); if (!Inserted.second) return; @@ -1620,7 +1716,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: continue; } assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls, - CallToMatchingCall); + CallToMatchingCall, ImportantContextIds); } // If this node's stack id is in the map, update the graph to contain new @@ -1648,6 +1744,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: Node->setCall(Call); NonAllocationCallToContextNodeMap[Call] = Node; NodeToCallingFunc[Node] = Func; + recordStackNode(Ids, Node, Node->getContextIds(), ImportantContextIds); return; } } @@ -1786,6 +1883,9 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: : CurNode->computeAllocType(); PrevNode = CurNode; } + + recordStackNode(Ids, NewNode, SavedContextIds, ImportantContextIds); + if (VerifyNodes) { checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true); for (auto Id : Ids) { @@ -1799,6 +1899,122 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: } template <typename DerivedCCG, typename FuncTy, typename CallTy> +void CallsiteContextGraph<DerivedCCG, FuncTy, + CallTy>::fixupImportantContexts() { + if (ImportantContextIdInfo.empty()) + return; + + // Update statistics as we are done building this map at this point. + NumImportantContextIds = ImportantContextIdInfo.size(); + + if (!MemProfFixupImportant) + return; + + if (ExportToDot) + exportToDot("beforestackfixup"); + + // For each context we identified as important, walk through the saved context + // stack ids in order from leaf upwards, and make sure all edges are correct. + // These can be difficult to get right when updating the graph while mapping + // nodes onto summary or IR, especially when there is recursion. 
In + // particular, when we have created new nodes to reflect inlining, it is + // sometimes impossible to know exactly how to update the edges in the face of + // recursion, as we have lost the original ordering of the stack ids in the + // contexts. + // TODO: Consider only doing this if we detect the context has recursive + // cycles. + // + // I.e. assume we have a context with stack ids like: {A B A C A D E} + // and let's say A was inlined into B, C, and D. The original graph will have + // multiple recursive cycles through A. When we match the original context + // nodes onto the IR or summary, we will merge {A B} into one context node, + // {A C} onto another, and {A D} onto another. Looking at the stack sequence + // above, we should end up with a non-cyclic set of edges like: + // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the + // original ordering, we won't get the edges correct initially (it's + // impossible without the original ordering). Here we do the fixup (add and + // removing edges where necessary) for this context. In the + // ImportantContextInfo struct in this case we should have a MaxLength = 2, + // and map entries for {A B}, {A C}, {A D}, and {E}. + for (auto &[CurContextId, Info] : ImportantContextIdInfo) { + if (Info.StackIdsToNode.empty()) + continue; + bool Changed = false; + ContextNode *PrevNode = nullptr; + ContextNode *CurNode = nullptr; + DenseSet<const ContextEdge *> VisitedEdges; + ArrayRef<uint64_t> AllStackIds(Info.StackIds); + // Try to identify what callsite ContextNode maps to which slice of the + // context's ordered stack ids. + for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) { + // We will do this greedily, trying up to MaxLength stack ids in a row, to + // see if we recorded a context node for that sequence. + auto Len = Info.MaxLength; + auto LenToEnd = AllStackIds.size() - I; + if (Len > LenToEnd) + Len = LenToEnd; + CurNode = nullptr; + // Try to find a recorded context node starting with the longest length + // recorded, and on down until we check for just a single stack node. + for (; Len > 0; Len--) { + // Get the slice of the original stack id sequence to check. + auto CheckStackIds = AllStackIds.slice(I, Len); + auto EntryIt = Info.StackIdsToNode.find(CheckStackIds); + if (EntryIt == Info.StackIdsToNode.end()) + continue; + CurNode = EntryIt->second; + // Skip forward so we don't try to look for the ones we just matched. + // We increment by Len - 1, because the outer for loop will increment I. + I += Len - 1; + break; + } + // Give up if we couldn't find a node. Since we need to clone from the + // leaf allocation upwards, no sense in doing anymore fixup further up + // the context if we couldn't match part of the original stack context + // onto a callsite node. + if (!CurNode) + break; + // No edges to fix up until we have a pair of nodes that should be + // adjacent in the graph. + if (!PrevNode) + continue; + // See if we already have a call edge from CurNode to PrevNode. + auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode); + if (CurEdge) { + // We already have an edge. Make sure it contains this context id. + if (CurEdge->getContextIds().insert(CurContextId).second) { + NumFixupEdgeIdsInserted++; + Changed = true; + } + } else { + // No edge exists - add one. 
+ NumFixupEdgesAdded++; + DenseSet<uint32_t> ContextIds({CurContextId}); + auto AllocType = computeAllocType(ContextIds); + auto NewEdge = std::make_shared<ContextEdge>( + PrevNode, CurNode, AllocType, std::move(ContextIds)); + PrevNode->CallerEdges.push_back(NewEdge); + CurNode->CalleeEdges.push_back(NewEdge); + // Save the new edge for the below handling. + CurEdge = NewEdge.get(); + Changed = true; + } + VisitedEdges.insert(CurEdge); + // Now remove this context id from any other caller edges calling + // PrevNode. + for (auto &Edge : PrevNode->CallerEdges) { + // Skip the edge updating/created above and edges we have already + // visited (due to recursion). + if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get())) + Edge->getContextIds().erase(CurContextId); + } + } + if (Changed) + NumFixedContexts++; + } +} + +template <typename DerivedCCG, typename FuncTy, typename CallTy> void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { // Map of stack id to all calls with that as the last (outermost caller) // callsite id that has a context node (some might not due to pruning @@ -2043,9 +2259,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { // nodes representing any inlining at interior callsites. Note we move the // associated context ids over to the new nodes. DenseSet<const ContextNode *> Visited; + DenseSet<uint32_t> ImportantContextIds(llvm::from_range, + ImportantContextIdInfo.keys()); for (auto &Entry : AllocationCallToContextNodeMap) assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls, - CallToMatchingCall); + CallToMatchingCall, ImportantContextIds); + + fixupImportantContexts(); + if (VerifyCCG) check(); } @@ -2155,6 +2376,10 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( Module &M, llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) : Mod(M), OREGetter(OREGetter) { + // Map for keeping track of the largest cold contexts up to the number given + // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys + // must be sorted. + std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold; for (auto &F : M) { std::vector<CallInfo> CallsWithMetadata; for (auto &BB : F) { @@ -2191,7 +2416,8 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode); addStackNodesForMIB<MDNode, MDNode::op_iterator>( AllocNode, StackContext, CallsiteContext, - getMIBAllocType(MIBMD), ContextSizeInfo); + getMIBAllocType(MIBMD), ContextSizeInfo, + TotalSizeToContextIdTopNCold); } // If exporting the graph to dot and an allocation id of interest was // specified, record all the context ids for this allocation node. @@ -2241,6 +2467,10 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> isPrevailing) : Index(Index), isPrevailing(isPrevailing) { + // Map for keeping track of the largest cold contexts up to the number given + // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys + // must be sorted. 
+ std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold; for (auto &I : Index) { auto VI = Index.getValueInfo(I); for (auto &S : VI.getSummaryList()) { @@ -2288,7 +2518,7 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( } addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>( AllocNode, StackContext, EmptyContext, MIB.AllocType, - ContextSizeInfo); + ContextSizeInfo, TotalSizeToContextIdTopNCold); I++; } // If exporting the graph to dot and an allocation id of interest was diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index d7eb745..1f52f9a 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -208,7 +208,7 @@ namespace KernelInfo { // }; #define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \ - constexpr const unsigned MEMBER##Idx = IDX; + constexpr unsigned MEMBER##Idx = IDX; KERNEL_ENVIRONMENT_IDX(Configuration, 0) KERNEL_ENVIRONMENT_IDX(Ident, 1) @@ -216,7 +216,7 @@ KERNEL_ENVIRONMENT_IDX(Ident, 1) #undef KERNEL_ENVIRONMENT_IDX #define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \ - constexpr const unsigned MEMBER##Idx = IDX; + constexpr unsigned MEMBER##Idx = IDX; KERNEL_ENVIRONMENT_CONFIGURATION_IDX(UseGenericStateMachine, 0) KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MayUseNestedParallelism, 1) @@ -258,7 +258,7 @@ KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxTeams) GlobalVariable * getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB) { - constexpr const int InitKernelEnvironmentArgNo = 0; + constexpr int InitKernelEnvironmentArgNo = 0; return cast<GlobalVariable>( KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo) ->stripPointerCasts()); @@ -730,7 +730,7 @@ struct KernelInfoState : AbstractState { /// State to indicate if we can track parallel level of the associated /// function. We will give up tracking if we encounter unknown caller or the - /// caller is __kmpc_parallel_51. + /// caller is __kmpc_parallel_60. BooleanStateWithSetVector<uint8_t> ParallelLevels; /// Flag that indicates if the kernel has nested Parallelism @@ -2117,8 +2117,8 @@ Kernel OpenMPOpt::getUniqueKernelFor(Function &F) { return getUniqueKernelFor(*CB); OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI = - OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51]; - // Allow the use in __kmpc_parallel_51 calls. + OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60]; + // Allow the use in __kmpc_parallel_60 calls. if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI)) return getUniqueKernelFor(*CB); return nullptr; @@ -2145,7 +2145,7 @@ Kernel OpenMPOpt::getUniqueKernelFor(Function &F) { bool OpenMPOpt::rewriteDeviceCodeStateMachine() { OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI = - OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51]; + OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60]; bool Changed = false; if (!KernelParallelRFI) @@ -2157,7 +2157,7 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() { for (Function *F : SCC) { - // Check if the function is a use in a __kmpc_parallel_51 call at + // Check if the function is a use in a __kmpc_parallel_60 call at // all. bool UnknownUse = false; bool KernelParallelUse = false; @@ -2189,7 +2189,7 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() { UnknownUse = true; }); - // Do not emit a remark if we haven't seen a __kmpc_parallel_51 + // Do not emit a remark if we haven't seen a __kmpc_parallel_60 // use. 
if (!KernelParallelUse) continue; @@ -2208,7 +2208,7 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() { continue; } - // Even if we have __kmpc_parallel_51 calls, we (for now) give + // Even if we have __kmpc_parallel_60 calls, we (for now) give // up if the function is not called from a unique kernel. Kernel K = getUniqueKernelFor(*F); if (!K) { @@ -4457,7 +4457,7 @@ struct AAKernelInfoFunction : AAKernelInfo { Instruction *IsWorker = ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB, - ConstantInt::get(KernelInitCB->getType(), -1), + ConstantInt::getAllOnesValue(KernelInitCB->getType()), "thread.is_worker", InitBB); IsWorker->setDebugLoc(DLoc); BranchInst::Create(IsWorkerCheckBB, UserCodeEntryBB, IsWorker, InitBB); @@ -4821,8 +4821,8 @@ private: /// Update info regarding parallel levels. void updateParallelLevels(Attributor &A) { auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); - OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI = - OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51]; + OMPInformationCache::RuntimeFunctionInfo &Parallel60RFI = + OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60]; auto PredCallSite = [&](AbstractCallSite ACS) { Function *Caller = ACS.getInstruction()->getFunction(); @@ -4832,12 +4832,12 @@ private: auto *CAA = A.getOrCreateAAFor<AAKernelInfo>(IRPosition::function(*Caller)); if (CAA && CAA->ParallelLevels.isValidState()) { - // Any function that is called by `__kmpc_parallel_51` will not be + // Any function that is called by `__kmpc_parallel_60` will not be // folded as the parallel level in the function is updated. In order to // get it right, all the analysis would depend on the implentation. That // said, if in the future any change to the implementation, the analysis // could be wrong. As a consequence, we are just conservative here. - if (Caller == Parallel51RFI.Declaration) { + if (Caller == Parallel60RFI.Declaration) { ParallelLevels.indicatePessimisticFixpoint(); return true; } @@ -5006,8 +5006,8 @@ struct AAKernelInfoCallSite : AAKernelInfo { case OMPRTL___kmpc_target_deinit: KernelDeinitCB = &CB; break; - case OMPRTL___kmpc_parallel_51: - if (!handleParallel51(A, CB)) + case OMPRTL___kmpc_parallel_60: + if (!handleParallel60(A, CB)) indicatePessimisticFixpoint(); return; case OMPRTL___kmpc_omp_task: @@ -5075,8 +5075,8 @@ struct AAKernelInfoCallSite : AAKernelInfo { return indicatePessimisticFixpoint(); CallBase &CB = cast<CallBase>(getAssociatedValue()); - if (It->getSecond() == OMPRTL___kmpc_parallel_51) { - if (!handleParallel51(A, CB)) + if (It->getSecond() == OMPRTL___kmpc_parallel_60) { + if (!handleParallel60(A, CB)) return indicatePessimisticFixpoint(); return StateBefore == getState() ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED; @@ -5136,9 +5136,9 @@ struct AAKernelInfoCallSite : AAKernelInfo { : ChangeStatus::CHANGED; } - /// Deal with a __kmpc_parallel_51 call (\p CB). Returns true if the call was + /// Deal with a __kmpc_parallel_60 call (\p CB). Returns true if the call was /// handled, if a problem occurred, false is returned. 
- bool handleParallel51(Attributor &A, CallBase &CB) { + bool handleParallel60(Attributor &A, CallBase &CB) { const unsigned int NonWrapperFunctionArgNo = 5; const unsigned int WrapperFunctionArgNo = 6; auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed() diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index e39e311..8e76b79 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -83,7 +83,6 @@ #include <cstdint> #include <functional> #include <limits> -#include <map> #include <memory> #include <queue> #include <string> @@ -2293,7 +2292,6 @@ bool SampleProfileLoader::runOnFunction(Function &F, // count value. if (!F.getEntryCount()) F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real)); - std::unique_ptr<OptimizationRemarkEmitter> OwnedORE; auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent()) .getManager(); ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp index 70b8614..b9fb7a3 100644 --- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp @@ -18,6 +18,8 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/LongestCommonSequence.h" +#include <unordered_set> + using namespace llvm; using namespace sampleprof; diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 2dd0fde..7aa90ee 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -99,6 +99,7 @@ #include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/DebugCounter.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" @@ -130,6 +131,8 @@ STATISTIC(NumUniqueRetVal, "Number of unique return value optimizations"); STATISTIC(NumVirtConstProp1Bit, "Number of 1 bit virtual constant propagations"); STATISTIC(NumVirtConstProp, "Number of virtual constant propagations"); +DEBUG_COUNTER(CallsToDevirt, "calls-to-devirt", + "Controls how many calls should be devirtualized."); namespace llvm { @@ -219,14 +222,6 @@ static cl::opt<bool> WholeProgramDevirtKeepUnreachableFunction( cl::desc("Regard unreachable functions as possible devirtualize targets."), cl::Hidden, cl::init(true)); -/// If explicitly specified, the devirt module pass will stop transformation -/// once the total number of devirtualizations reach the cutoff value. Setting -/// this option to 0 explicitly will do 0 devirtualization. -static cl::opt<unsigned> WholeProgramDevirtCutoff( - "wholeprogramdevirt-cutoff", - cl::desc("Max number of devirtualizations for devirt module pass"), - cl::init(0)); - /// Mechanism to add runtime checking of devirtualization decisions, optionally /// trapping or falling back to indirect call on any that are not correct. /// Trapping mode is useful for debugging undefined behavior leading to failures @@ -377,9 +372,6 @@ VirtualCallTarget::VirtualCallTarget(GlobalValue *Fn, const TypeMemberInfo *TM) namespace { -// Tracks the number of devirted calls in the IR transformation. -static unsigned NumDevirtCalls = 0; - // A slot in a set of virtual tables. 
The TypeID identifies the set of virtual // tables, and the ByteOffset is the offset in bytes from the address point to // the virtual function pointer. @@ -636,9 +628,11 @@ struct DevirtModule { std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest; PatternList FunctionsToSkip; + const bool DevirtSpeculatively; DevirtModule(Module &M, ModuleAnalysisManager &MAM, ModuleSummaryIndex *ExportSummary, - const ModuleSummaryIndex *ImportSummary) + const ModuleSummaryIndex *ImportSummary, + bool DevirtSpeculatively) : M(M), MAM(MAM), FAM(MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()), ExportSummary(ExportSummary), ImportSummary(ImportSummary), @@ -651,7 +645,8 @@ struct DevirtModule { RemarksEnabled(areRemarksEnabled()), OREGetter([&](Function &F) -> OptimizationRemarkEmitter & { return FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); - }) { + }), + DevirtSpeculatively(DevirtSpeculatively) { assert(!(ExportSummary && ImportSummary)); FunctionsToSkip.init(SkipFunctionNames); } @@ -765,7 +760,8 @@ struct DevirtModule { // Lower the module using the action and summary passed as command line // arguments. For testing purposes only. - static bool runForTesting(Module &M, ModuleAnalysisManager &MAM); + static bool runForTesting(Module &M, ModuleAnalysisManager &MAM, + bool DevirtSpeculatively); }; struct DevirtIndex { @@ -808,11 +804,22 @@ struct DevirtIndex { PreservedAnalyses WholeProgramDevirtPass::run(Module &M, ModuleAnalysisManager &MAM) { if (UseCommandLine) { - if (!DevirtModule::runForTesting(M, MAM)) + if (!DevirtModule::runForTesting(M, MAM, ClDevirtualizeSpeculatively)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } - if (!DevirtModule(M, MAM, ExportSummary, ImportSummary).run()) + + std::optional<ModuleSummaryIndex> Index; + if (!ExportSummary && !ImportSummary && DevirtSpeculatively) { + // Build the ExportSummary from the module. + assert(!ExportSummary && + "ExportSummary is expected to be empty in non-LTO mode"); + ProfileSummaryInfo PSI(M); + Index.emplace(buildModuleSummaryIndex(M, nullptr, &PSI)); + ExportSummary = Index.has_value() ? &Index.value() : nullptr; + } + if (!DevirtModule(M, MAM, ExportSummary, ImportSummary, DevirtSpeculatively) + .run()) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } @@ -1010,7 +1017,8 @@ static Error checkCombinedSummaryForTesting(ModuleSummaryIndex *Summary) { return ErrorSuccess(); } -bool DevirtModule::runForTesting(Module &M, ModuleAnalysisManager &MAM) { +bool DevirtModule::runForTesting(Module &M, ModuleAnalysisManager &MAM, + bool DevirtSpeculatively) { std::unique_ptr<ModuleSummaryIndex> Summary = std::make_unique<ModuleSummaryIndex>(/*HaveGVs=*/false); @@ -1039,7 +1047,8 @@ bool DevirtModule::runForTesting(Module &M, ModuleAnalysisManager &MAM) { ClSummaryAction == PassSummaryAction::Export ? Summary.get() : nullptr, ClSummaryAction == PassSummaryAction::Import ? Summary.get() - : nullptr) + : nullptr, + DevirtSpeculatively) .run(); if (!ClWriteSummary.empty()) { @@ -1103,10 +1112,10 @@ bool DevirtModule::tryFindVirtualCallTargets( if (!TM.Bits->GV->isConstant()) return false; - // Without ClDevirtualizeSpeculatively, we cannot perform whole program + // Without DevirtSpeculatively, we cannot perform whole program // devirtualization analysis on a vtable with public LTO visibility. 
- if (!ClDevirtualizeSpeculatively && TM.Bits->GV->getVCallVisibility() == - GlobalObject::VCallVisibilityPublic) + if (!DevirtSpeculatively && TM.Bits->GV->getVCallVisibility() == + GlobalObject::VCallVisibilityPublic) return false; Function *Fn = nullptr; @@ -1127,7 +1136,7 @@ bool DevirtModule::tryFindVirtualCallTargets( // In most cases empty functions will be overridden by the // implementation of the derived class, so we can skip them. - if (ClDevirtualizeSpeculatively && Fn->getReturnType()->isVoidTy() && + if (DevirtSpeculatively && Fn->getReturnType()->isVoidTy() && Fn->getInstructionCount() <= 1) continue; @@ -1216,15 +1225,13 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, continue; // Stop when the number of devirted calls reaches the cutoff. - if (WholeProgramDevirtCutoff.getNumOccurrences() > 0 && - NumDevirtCalls >= WholeProgramDevirtCutoff) - return; + if (!DebugCounter::shouldExecute(CallsToDevirt)) + continue; if (RemarksEnabled) VCallSite.emitRemark("single-impl", TheFn->stripPointerCasts()->getName(), OREGetter); NumSingleImpl++; - NumDevirtCalls++; auto &CB = VCallSite.CB; assert(!CB.getCalledFunction() && "devirtualizing direct call?"); IRBuilder<> Builder(&CB); @@ -1250,8 +1257,7 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, // add support to compare the virtual function pointer to the // devirtualized target. In case of a mismatch, fall back to indirect // call. - if (DevirtCheckMode == WPDCheckMode::Fallback || - ClDevirtualizeSpeculatively) { + if (DevirtCheckMode == WPDCheckMode::Fallback || DevirtSpeculatively) { MDNode *Weights = MDBuilder(M.getContext()).createLikelyBranchWeights(); // Version the indirect call site. If the called value is equal to the // given callee, 'NewInst' will be executed, otherwise the original call @@ -2375,7 +2381,7 @@ bool DevirtModule::run() { Function *PublicTypeTestFunc = nullptr; // If we are in speculative devirtualization mode, we can work on the public // type test intrinsics. - if (ClDevirtualizeSpeculatively) + if (DevirtSpeculatively) PublicTypeTestFunc = Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test); Function *TypeTestFunc = @@ -2511,7 +2517,7 @@ bool DevirtModule::run() { // Out of speculative devirtualization mode, Try to apply virtual constant // propagation or branch funneling. // TODO: This should eventually be enabled for non-public type tests. - if (!SingleImplDevirt && !ClDevirtualizeSpeculatively) { + if (!SingleImplDevirt && !DevirtSpeculatively) { DidVirtualConstProp |= tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first); |

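The GlobalOpt hunks rework static resolution of calls through FMV ifuncs: versions are sorted by decreasing priority, a call is bound directly only when the callee's feature bits are a subset of the caller's, and for FMV callers the candidate index is advanced past callees whose features strictly exceed the caller's (those callees could not have been rejected at runtime if this caller was selected). The sketch below models that core loop with plain 64-bit masks; it deliberately omits the more expensive multi-caller refinement gated by `max-ifunc-versions`, and the types `Version`, `resolveCalls`, `isSubsetOf`, and the feature-bit labels are invented for illustration.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Illustrative model of the feature-bit reasoning in OptimizeNonTrivialIFuncs.
// Both lists are assumed to be pre-sorted by decreasing priority.
struct Version {
  std::string Name;
  uint64_t Features; // one bit per target feature
};

static bool isSubsetOf(uint64_t A, uint64_t B) { return (A & ~B) == 0; }

// For each caller, returns the callee it can be statically bound to, or an
// empty string if the call must keep going through the ifunc resolver.
std::vector<std::string> resolveCalls(const std::vector<Version> &Callers,
                                      const std::vector<Version> &Callees,
                                      bool CallersAreFMV) {
  std::vector<std::string> Result(Callers.size());
  size_t J = 0; // highest-priority callee candidate still available
  for (size_t I = 0; I < Callers.size() && J < Callees.size(); ++I) {
    uint64_t CallerBits = Callers[I].Features;
    uint64_t CalleeBits = Callees[J].Features;
    // A direct call is legal iff the callee needs no feature the caller lacks.
    if (isSubsetOf(CalleeBits, CallerBits))
      Result[I] = Callees[J].Name;
    if (!CallersAreFMV)
      continue;
    // If this FMV caller was selected at runtime, any callee whose features
    // are a superset of the caller's is off the table for lower-priority
    // callers, so advance past those candidates.
    while (J < Callees.size() && isSubsetOf(CallerBits, Callees[J].Features))
      ++J;
  }
  return Result;
}

int main() {
  // Bit 0 and bit 1 stand in for two target features (labels are arbitrary).
  std::vector<Version> Callees = {
      {"foo._Msve2", 0b11}, {"foo._Msve", 0b01}, {"foo.default", 0}};
  std::vector<Version> Callers = {
      {"bar._Msve2", 0b11}, {"bar._Msve", 0b01}, {"bar.default", 0}};
  for (auto &Name : resolveCalls(Callers, Callees, /*CallersAreFMV=*/true))
    std::cout << (Name.empty() ? "<via resolver>" : Name) << "\n";
}

Running this binds each caller version to the callee of matching priority (bar._Msve2 -> foo._Msve2, and so on), which is the behavior the patch's comment block describes for the linear, non-expensive path.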