Diffstat (limited to 'llvm/lib/Transforms/IPO')
-rw-r--r--  llvm/lib/Transforms/IPO/AttributorAttributes.cpp          |  84
-rw-r--r--  llvm/lib/Transforms/IPO/ExpandVariadics.cpp               |   7
-rw-r--r--  llvm/lib/Transforms/IPO/GlobalOpt.cpp                     | 299
-rw-r--r--  llvm/lib/Transforms/IPO/IROutliner.cpp                    |   9
-rw-r--r--  llvm/lib/Transforms/IPO/LowerTypeTests.cpp                |  17
-rw-r--r--  llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp  | 266
-rw-r--r--  llvm/lib/Transforms/IPO/OpenMPOpt.cpp                     |  42
-rw-r--r--  llvm/lib/Transforms/IPO/SampleProfile.cpp                 |   2
-rw-r--r--  llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp          |   2
-rw-r--r--  llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp            |  66
10 files changed, 604 insertions, 190 deletions
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 5048561..074d8ed 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -3619,7 +3619,7 @@ struct AAIntraFnReachabilityFunction final
return true;
RQITy StackRQI(A, From, To, ExclusionSet, false);
- typename RQITy::Reachable Result;
+ RQITy::Reachable Result;
if (!NonConstThis->checkQueryCache(A, StackRQI, Result))
return NonConstThis->isReachableImpl(A, StackRQI,
/*IsTemporaryRQI=*/true);
@@ -5185,6 +5185,7 @@ struct AADereferenceableCallSiteReturned final
// ------------------------ Align Argument Attribute ------------------------
namespace {
+
static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA,
Value &AssociatedValue, const Use *U,
const Instruction *I, bool &TrackUse) {
@@ -5200,6 +5201,35 @@ static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA,
TrackUse = true;
return 0;
}
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::ptrmask: {
+ // Is it appropriate to pull attribute in initialization?
+ const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>(
+ QueryingAA, IRPosition::value(*II->getOperand(1)), DepClassTy::NONE);
+ const auto *AlignAA = A.getAAFor<AAAlign>(
+ QueryingAA, IRPosition::value(*II), DepClassTy::NONE);
+ if (ConstVals && ConstVals->isValidState() && ConstVals->isAtFixpoint()) {
+ unsigned ShiftValue = std::min(ConstVals->getAssumedMinTrailingZeros(),
+ Value::MaxAlignmentExponent);
+ Align ConstAlign(UINT64_C(1) << ShiftValue);
+ if (AlignAA && ConstAlign >= AlignAA->getKnownAlign())
+ return Align(1).value();
+ }
+ if (AlignAA)
+ return AlignAA->getKnownAlign().value();
+ break;
+ }
+ case Intrinsic::amdgcn_make_buffer_rsrc: {
+ const auto *AlignAA = A.getAAFor<AAAlign>(
+ QueryingAA, IRPosition::value(*II), DepClassTy::NONE);
+ if (AlignAA)
+ return AlignAA->getKnownAlign().value();
+ break;
+ }
+ default:
+ break;
+ }
MaybeAlign MA;
if (const auto *CB = dyn_cast<CallBase>(I)) {
@@ -5499,6 +5529,56 @@ struct AAAlignCallSiteReturned final
AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A)
: Base(IRP, A) {}
+ ChangeStatus updateImpl(Attributor &A) override {
+ Instruction *I = getIRPosition().getCtxI();
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::ptrmask: {
+ Align Alignment;
+ bool Valid = false;
+
+ const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>(
+ *this, IRPosition::value(*II->getOperand(1)), DepClassTy::REQUIRED);
+ if (ConstVals && ConstVals->isValidState()) {
+ unsigned ShiftValue =
+ std::min(ConstVals->getAssumedMinTrailingZeros(),
+ Value::MaxAlignmentExponent);
+ Alignment = Align(UINT64_C(1) << ShiftValue);
+ Valid = true;
+ }
+
+ const auto *AlignAA =
+ A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))),
+ DepClassTy::REQUIRED);
+ if (AlignAA) {
+ Alignment = std::max(AlignAA->getAssumedAlign(), Alignment);
+ Valid = true;
+ }
+
+ if (Valid)
+ return clampStateAndIndicateChange<StateType>(
+ this->getState(),
+ std::min(this->getAssumedAlign(), Alignment).value());
+ break;
+ }
+ // FIXME: Should introduce target-specific sub-attributes and let
+ // getAAFor<AAAlign> create a sub-attribute to handle target-specific
+ // intrinsics.
+ case Intrinsic::amdgcn_make_buffer_rsrc: {
+ const auto *AlignAA =
+ A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))),
+ DepClassTy::REQUIRED);
+ if (AlignAA)
+ return clampStateAndIndicateChange<StateType>(
+ this->getState(), AlignAA->getAssumedAlign().value());
+ break;
+ }
+ default:
+ break;
+ }
+ }
+ return Base::updateImpl(A);
+ }
/// See AbstractAttribute::trackStatistics()
void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); }
};
@@ -10701,7 +10781,7 @@ struct AAInterFnReachabilityFunction
auto *NonConstThis = const_cast<AAInterFnReachabilityFunction *>(this);
RQITy StackRQI(A, From, To, ExclusionSet, false);
- typename RQITy::Reachable Result;
+ RQITy::Reachable Result;
if (!NonConstThis->checkQueryCache(A, StackRQI, Result))
return NonConstThis->isReachableImpl(A, StackRQI,
/*IsTemporaryRQI=*/true);
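
The ptrmask handling added above derives alignment two ways: from the constant mask operand, via its minimum number of trailing zero bits, and from the alignment already known for the source pointer, taking the larger of the two. Below is a minimal standalone sketch of that arithmetic using plain integers rather than the Attributor/AAAlign machinery; the function name and the 32-bit exponent cap are illustrative only.

    // Alignment implied by an llvm.ptrmask-style mask: a mask with N trailing
    // zero bits forces the result to be at least 2^N-byte aligned, and masking
    // low bits away can only preserve or increase the source pointer's known
    // alignment, so the larger of the two bounds can be assumed.
    #include <algorithm>
    #include <bit>
    #include <cstdint>
    #include <cstdio>

    static uint64_t assumedAlignAfterPtrmask(uint64_t Mask, uint64_t SrcAlign) {
      // Cap the exponent, mirroring the clamp against Value::MaxAlignmentExponent.
      unsigned TrailingZeros =
          std::min(static_cast<unsigned>(std::countr_zero(Mask)), 32u);
      uint64_t MaskAlign = uint64_t(1) << TrailingZeros;
      return std::max(MaskAlign, SrcAlign);
    }

    int main() {
      // A mask that clears the low 4 bits yields a result that is at least
      // 16-byte aligned, even if the source was only known to be 8-byte aligned.
      std::printf("%llu\n", (unsigned long long)assumedAlignAfterPtrmask(
                                0xFFFFFFFFFFFFFFF0ULL, 8));
      return 0;
    }
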
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index 6a11aec..4863d6b 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -422,6 +422,13 @@ bool ExpandVariadics::runOnFunction(Module &M, IRBuilder<> &Builder,
assert(VariadicWrapperDefine == VariadicWrapper);
assert(!VariadicWrapper->isDeclaration());
+ // Add the prof metadata from the original function to the wrapper. Because
+ // FixedArityReplacement is the owner of the original function's prof metadata
+ // after the splice, we need to transfer it to VariadicWrapper.
+ VariadicWrapper->setMetadata(
+ LLVMContext::MD_prof,
+ FixedArityReplacement->getMetadata(LLVMContext::MD_prof));
+
// We now have:
// 1. the original function, now as a declaration with no uses
// 2. a variadic function that unconditionally calls a fixed arity replacement
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 99c4982..66b95fc 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -99,6 +99,11 @@ static cl::opt<bool>
"functions from non-versioned callers."),
cl::init(true), cl::Hidden);
+static cl::opt<unsigned> MaxIFuncVersions(
+ "max-ifunc-versions", cl::Hidden, cl::init(5),
+ cl::desc("Maximum number of caller/callee versions that is allowed for "
+ "using the expensive (cubic) static resolution algorithm."));
+
static cl::opt<bool>
EnableColdCCStressTest("enable-coldcc-stress-test",
cl::desc("Enable stress test of coldcc by adding "
@@ -2018,12 +2023,15 @@ OptimizeFunctions(Module &M,
if (hasChangeableCC(&F, ChangeableCCCache)) {
// If this function has a calling convention worth changing, is not a
- // varargs function, and is only called directly, promote it to use the
- // Fast calling convention.
- F.setCallingConv(CallingConv::Fast);
- ChangeCalleesToFastCall(&F);
- ++NumFastCallFns;
- Changed = true;
+ // varargs function, is only called directly, and is supported by the
+ // target, promote it to use the Fast calling convention.
+ TargetTransformInfo &TTI = GetTTI(F);
+ if (TTI.useFastCCForInternalCall(F)) {
+ F.setCallingConv(CallingConv::Fast);
+ ChangeCalleesToFastCall(&F);
+ ++NumFastCallFns;
+ Changed = true;
+ }
}
if (F.getAttributes().hasAttrSomewhere(Attribute::Nest) &&
@@ -2488,20 +2496,21 @@ DeleteDeadIFuncs(Module &M,
// Follows the use-def chain of \p V backwards until it finds a Function,
// in which case it collects in \p Versions. Return true on successful
// use-def chain traversal, false otherwise.
-static bool collectVersions(TargetTransformInfo &TTI, Value *V,
- SmallVectorImpl<Function *> &Versions) {
+static bool
+collectVersions(Value *V, SmallVectorImpl<Function *> &Versions,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI) {
if (auto *F = dyn_cast<Function>(V)) {
- if (!TTI.isMultiversionedFunction(*F))
+ if (!GetTTI(*F).isMultiversionedFunction(*F))
return false;
Versions.push_back(F);
} else if (auto *Sel = dyn_cast<SelectInst>(V)) {
- if (!collectVersions(TTI, Sel->getTrueValue(), Versions))
+ if (!collectVersions(Sel->getTrueValue(), Versions, GetTTI))
return false;
- if (!collectVersions(TTI, Sel->getFalseValue(), Versions))
+ if (!collectVersions(Sel->getFalseValue(), Versions, GetTTI))
return false;
} else if (auto *Phi = dyn_cast<PHINode>(V)) {
for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
- if (!collectVersions(TTI, Phi->getIncomingValue(I), Versions))
+ if (!collectVersions(Phi->getIncomingValue(I), Versions, GetTTI))
return false;
} else {
// Unknown instruction type. Bail.
@@ -2510,31 +2519,45 @@ static bool collectVersions(TargetTransformInfo &TTI, Value *V,
return true;
}
-// Bypass the IFunc Resolver of MultiVersioned functions when possible. To
-// deduce whether the optimization is legal we need to compare the target
-// features between caller and callee versions. The criteria for bypassing
-// the resolver are the following:
-//
-// * If the callee's feature set is a subset of the caller's feature set,
-// then the callee is a candidate for direct call.
-//
-// * Among such candidates the one of highest priority is the best match
-// and it shall be picked, unless there is a version of the callee with
-// higher priority than the best match which cannot be picked from a
-// higher priority caller (directly or through the resolver).
-//
-// * For every higher priority callee version than the best match, there
-// is a higher priority caller version whose feature set availability
-// is implied by the callee's feature set.
+// Try to statically resolve calls to versioned functions when possible. First
+// we identify the function versions which are associated with an IFUNC symbol.
+// We do that by examining the resolver function of the IFUNC. Once we have
+// collected all the function versions, we sort them in decreasing priority
+// order. This is necessary for determining the most suitable callee version
+// for each caller version. We then collect all the callsites to versioned
+// functions. The static resolution is performed by comparing the feature sets
+// between callers and callees. Specifically:
+// * Start a walk over caller and callee lists simultaneously in order of
+// decreasing priority.
+// * Statically resolve calls from the current caller to the current callee,
+// iff the caller feature bits are a superset of the callee feature bits.
+// * For FMV callers, as long as the caller feature bits are a subset of the
+// callee feature bits, advance to the next callee. This effectively prevents
+// considering the current callee as a candidate for static resolution by the
+// callers that follow (in a hypothetical runtime execution reaching them, the
+// preceding, higher priority callers would not have been selected).
+// * Advance to the next caller.
//
+// Presentation in EuroLLVM2025:
+// https://www.youtube.com/watch?v=k54MFimPz-A&t=867s
static bool OptimizeNonTrivialIFuncs(
Module &M, function_ref<TargetTransformInfo &(Function &)> GetTTI) {
bool Changed = false;
- // Cache containing the mask constructed from a function's target features.
+ // Map containing the feature bits for a given function.
DenseMap<Function *, APInt> FeatureMask;
+ // Map containing the priority bits for a given function.
+ DenseMap<Function *, APInt> PriorityMask;
+ // Map containing all the function versions corresponding to an IFunc symbol.
+ DenseMap<GlobalIFunc *, SmallVector<Function *>> VersionedFuncs;
+ // Map containing the IFunc symbol that a function is a version of.
+ DenseMap<Function *, GlobalIFunc *> VersionOf;
+ // List of all the interesting IFuncs found in the module.
+ SmallVector<GlobalIFunc *> IFuncs;
for (GlobalIFunc &IF : M.ifuncs()) {
+ LLVM_DEBUG(dbgs() << "Examining IFUNC " << IF.getName() << "\n");
+
if (IF.isInterposable())
continue;
@@ -2545,107 +2568,175 @@ static bool OptimizeNonTrivialIFuncs(
if (Resolver->isInterposable())
continue;
- TargetTransformInfo &TTI = GetTTI(*Resolver);
-
- // Discover the callee versions.
- SmallVector<Function *> Callees;
- if (any_of(*Resolver, [&TTI, &Callees](BasicBlock &BB) {
+ SmallVector<Function *> Versions;
+ // Discover the versioned functions.
+ if (any_of(*Resolver, [&](BasicBlock &BB) {
if (auto *Ret = dyn_cast_or_null<ReturnInst>(BB.getTerminator()))
- if (!collectVersions(TTI, Ret->getReturnValue(), Callees))
+ if (!collectVersions(Ret->getReturnValue(), Versions, GetTTI))
return true;
return false;
}))
continue;
- if (Callees.empty())
+ if (Versions.empty())
continue;
- LLVM_DEBUG(dbgs() << "Statically resolving calls to function "
- << Resolver->getName() << "\n");
-
- // Cache the feature mask for each callee.
- for (Function *Callee : Callees) {
- auto [It, Inserted] = FeatureMask.try_emplace(Callee);
- if (Inserted)
- It->second = TTI.getFeatureMask(*Callee);
+ for (Function *V : Versions) {
+ VersionOf.insert({V, &IF});
+ auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(V);
+ if (FeatInserted)
+ FeatIt->second = GetTTI(*V).getFeatureMask(*V);
+ auto [PriorIt, PriorInserted] = PriorityMask.try_emplace(V);
+ if (PriorInserted)
+ PriorIt->second = GetTTI(*V).getPriorityMask(*V);
}
- // Sort the callee versions in decreasing priority order.
- sort(Callees, [&](auto *LHS, auto *RHS) {
- return FeatureMask[LHS].ugt(FeatureMask[RHS]);
+ // Sort function versions in decreasing priority order.
+ sort(Versions, [&](auto *LHS, auto *RHS) {
+ return PriorityMask[LHS].ugt(PriorityMask[RHS]);
});
- // Find the callsites and cache the feature mask for each caller.
- SmallVector<Function *> Callers;
+ IFuncs.push_back(&IF);
+ VersionedFuncs.try_emplace(&IF, std::move(Versions));
+ }
+
+ for (GlobalIFunc *CalleeIF : IFuncs) {
+ SmallVector<Function *> NonFMVCallers;
+ DenseSet<GlobalIFunc *> CallerIFuncs;
DenseMap<Function *, SmallVector<CallBase *>> CallSites;
- for (User *U : IF.users()) {
+
+ // Find the callsites.
+ for (User *U : CalleeIF->users()) {
if (auto *CB = dyn_cast<CallBase>(U)) {
- if (CB->getCalledOperand() == &IF) {
+ if (CB->getCalledOperand() == CalleeIF) {
Function *Caller = CB->getFunction();
- auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(Caller);
- if (FeatInserted)
- FeatIt->second = TTI.getFeatureMask(*Caller);
- auto [CallIt, CallInserted] = CallSites.try_emplace(Caller);
- if (CallInserted)
- Callers.push_back(Caller);
- CallIt->second.push_back(CB);
+ GlobalIFunc *CallerIF = nullptr;
+ TargetTransformInfo &TTI = GetTTI(*Caller);
+ bool CallerIsFMV = TTI.isMultiversionedFunction(*Caller);
+ // The caller is a version of a known IFunc.
+ if (auto It = VersionOf.find(Caller); It != VersionOf.end())
+ CallerIF = It->second;
+ else if (!CallerIsFMV && OptimizeNonFMVCallers) {
+ // The caller is non-FMV.
+ auto [It, Inserted] = FeatureMask.try_emplace(Caller);
+ if (Inserted)
+ It->second = TTI.getFeatureMask(*Caller);
+ } else
+ // The caller is none of the above, skip.
+ continue;
+ auto [It, Inserted] = CallSites.try_emplace(Caller);
+ if (Inserted) {
+ if (CallerIsFMV)
+ CallerIFuncs.insert(CallerIF);
+ else
+ NonFMVCallers.push_back(Caller);
+ }
+ It->second.push_back(CB);
}
}
}
- // Sort the caller versions in decreasing priority order.
- sort(Callers, [&](auto *LHS, auto *RHS) {
- return FeatureMask[LHS].ugt(FeatureMask[RHS]);
- });
+ if (CallSites.empty())
+ continue;
- auto implies = [](APInt A, APInt B) { return B.isSubsetOf(A); };
-
- // Index to the highest priority candidate.
- unsigned I = 0;
- // Now try to redirect calls starting from higher priority callers.
- for (Function *Caller : Callers) {
- assert(I < Callees.size() && "Found callers of equal priority");
-
- Function *Callee = Callees[I];
- APInt CallerBits = FeatureMask[Caller];
- APInt CalleeBits = FeatureMask[Callee];
-
- // In the case of FMV callers, we know that all higher priority callers
- // than the current one did not get selected at runtime, which helps
- // reason about the callees (if they have versions that mandate presence
- // of the features which we already know are unavailable on this target).
- if (TTI.isMultiversionedFunction(*Caller)) {
- // If the feature set of the caller implies the feature set of the
- // highest priority candidate then it shall be picked. In case of
- // identical sets advance the candidate index one position.
- if (CallerBits == CalleeBits)
- ++I;
- else if (!implies(CallerBits, CalleeBits)) {
- // Keep advancing the candidate index as long as the caller's
- // features are a subset of the current candidate's.
- while (implies(CalleeBits, CallerBits)) {
- if (++I == Callees.size())
- break;
- CalleeBits = FeatureMask[Callees[I]];
+ LLVM_DEBUG(dbgs() << "Statically resolving calls to function "
+ << CalleeIF->getResolverFunction()->getName() << "\n");
+
+ // The complexity of this algorithm is linear: O(NumCallers + NumCallees)
+ // if NumCallers > MaxIFuncVersions || NumCallees > MaxIFuncVersions,
+ // otherwise it is cubic: O((NumCallers ^ 2) x NumCallees).
+ auto staticallyResolveCalls = [&](ArrayRef<Function *> Callers,
+ ArrayRef<Function *> Callees,
+ bool CallerIsFMV) {
+ bool AllowExpensiveChecks = CallerIsFMV &&
+ Callers.size() <= MaxIFuncVersions &&
+ Callees.size() <= MaxIFuncVersions;
+ // Index of the highest priority callee candidate.
+ unsigned J = 0;
+
+ for (unsigned I = 0, E = Callers.size(); I < E; ++I) {
+ // There are no callee candidates left.
+ if (J == Callees.size())
+ break;
+
+ Function *Caller = Callers[I];
+ APInt CallerBits = FeatureMask[Caller];
+
+ // Compare the feature bits of the best callee candidate with all the
+ // caller versions preceding the current one. For each prior caller
+ // discard feature bits that are known to be available in the current
+ // caller. As long as the known missing feature bits are a subset of the
+ // callee feature bits, advance to the next callee and start over.
+ auto eliminateAvailableFeatures = [&](unsigned BestCandidate) {
+ unsigned K = 0;
+ while (K < I && BestCandidate < Callees.size()) {
+ APInt MissingBits = FeatureMask[Callers[K]] & ~CallerBits;
+ if (MissingBits.isSubsetOf(FeatureMask[Callees[BestCandidate]])) {
+ ++BestCandidate;
+ // Start over.
+ K = 0;
+ } else
+ ++K;
}
+ return BestCandidate;
+ };
+
+ unsigned BestCandidate =
+ AllowExpensiveChecks ? eliminateAvailableFeatures(J) : J;
+ // No callee candidate was found for this caller.
+ if (BestCandidate == Callees.size())
continue;
+
+ LLVM_DEBUG(dbgs() << " Examining "
+ << (CallerIsFMV ? "FMV" : "regular") << " caller "
+ << Caller->getName() << "\n");
+
+ Function *Callee = Callees[BestCandidate];
+ APInt CalleeBits = FeatureMask[Callee];
+
+ // Statically resolve calls from the current caller to the current
+ // callee, iff the caller feature bits are a superset of the callee
+ // feature bits.
+ if (CalleeBits.isSubsetOf(CallerBits)) {
+ // Not all caller versions are necessarily users of the callee IFUNC.
+ if (auto It = CallSites.find(Caller); It != CallSites.end()) {
+ for (CallBase *CS : It->second) {
+ LLVM_DEBUG(dbgs() << " Redirecting call " << Caller->getName()
+ << " -> " << Callee->getName() << "\n");
+ CS->setCalledOperand(Callee);
+ }
+ Changed = true;
+ }
}
- } else {
- // We can't reason much about non-FMV callers. Just pick the highest
- // priority callee if it matches, otherwise bail.
- if (!OptimizeNonFMVCallers || I > 0 || !implies(CallerBits, CalleeBits))
+
+ // Nothing else to do about non-FMV callers.
+ if (!CallerIsFMV)
continue;
+
+ // For FMV callers, as long as the caller feature bits are a subset of
+ // the callee feature bits, advance to the next callee. This effectively
+ // prevents considering the current callee as a candidate for static
+ // resolution by following callers.
+ while (CallerBits.isSubsetOf(FeatureMask[Callees[J]]) &&
+ ++J < Callees.size())
+ ;
}
- auto &Calls = CallSites[Caller];
- for (CallBase *CS : Calls) {
- LLVM_DEBUG(dbgs() << "Redirecting call " << Caller->getName() << " -> "
- << Callee->getName() << "\n");
- CS->setCalledOperand(Callee);
- }
- Changed = true;
+ };
+
+ auto &Callees = VersionedFuncs[CalleeIF];
+
+ // Optimize non-FMV calls.
+ if (OptimizeNonFMVCallers)
+ staticallyResolveCalls(NonFMVCallers, Callees, /*CallerIsFMV=*/false);
+
+ // Optimize FMV calls.
+ for (GlobalIFunc *CallerIF : CallerIFuncs) {
+ auto &Callers = VersionedFuncs[CallerIF];
+ staticallyResolveCalls(Callers, Callees, /*CallerIsFMV=*/true);
}
- if (IF.use_empty() ||
- all_of(IF.users(), [](User *U) { return isa<GlobalAlias>(U); }))
+
+ if (CalleeIF->use_empty() ||
+ all_of(CalleeIF->users(), [](User *U) { return isa<GlobalAlias>(U); }))
NumIFuncsResolved++;
}
return Changed;
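
The comment block introduced above describes the caller/callee walk in prose; the sketch below replays it with plain uint64_t feature sets instead of APInt and TTI, treats every caller as an FMV version of the same function, and leaves out the optional cubic eliminateAvailableFeatures refinement and the separate priority mask. All names and the feature-bit encoding are made up for illustration.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Version {
      const char *Name;
      uint64_t Features; // Bit set of required target features.
    };

    static bool isSubsetOf(uint64_t A, uint64_t B) { return (A & ~B) == 0; }

    // Both lists must be sorted in decreasing priority order.
    static void resolve(const std::vector<Version> &Callers,
                        const std::vector<Version> &Callees) {
      unsigned J = 0; // Highest priority callee candidate still selectable.
      for (const Version &Caller : Callers) {
        if (J == Callees.size())
          break;
        // Resolve statically iff the caller's features cover the callee's.
        if (isSubsetOf(Callees[J].Features, Caller.Features))
          std::printf("%s -> %s\n", Caller.Name, Callees[J].Name);
        // If this caller's features are covered by the candidate, the candidate
        // cannot be the runtime choice for any lower priority caller either.
        while (J < Callees.size() &&
               isSubsetOf(Caller.Features, Callees[J].Features))
          ++J;
      }
    }

    int main() {
      // Hypothetical encoding: bit 0 = base, bit 1 = sve, bit 2 = sve2.
      std::vector<Version> Callees = {
          {"callee._Msve2", 0b111}, {"callee._Msve", 0b011}, {"callee", 0b001}};
      std::vector<Version> Callers = {{"caller._Msve", 0b011}, {"caller", 0b001}};
      resolve(Callers, Callees);
      return 0;
    }

This prints only "caller -> callee": the sve caller stays on the resolver because the sve2 version could still be chosen at run time, yet its presence in the walk is what lets the plain caller bind directly to the plain callee.
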
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index fdf0c3a..6e1ca9c 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -2108,8 +2108,7 @@ static void createAndInsertBasicBlocks(DenseMap<Value *, BasicBlock *> &OldMap,
for (Value *RetVal : SortedKeys) {
BasicBlock *NewBB = BasicBlock::Create(
- ParentFunc->getContext(),
- Twine(BaseName) + Twine("_") + Twine(static_cast<unsigned>(Idx++)),
+ ParentFunc->getContext(), Twine(BaseName) + Twine("_") + Twine(Idx++),
ParentFunc);
NewMap.insert(std::make_pair(RetVal, NewBB));
}
@@ -2286,9 +2285,9 @@ void IROutliner::deduplicateExtractedSections(
// Create a set of BasicBlocks, one for each return block, to hold the
// needed store instructions.
DenseMap<Value *, BasicBlock *> NewBBs;
- createAndInsertBasicBlocks(
- CurrentGroup.EndBBs, NewBBs, CurrentGroup.OutlinedFunction,
- "output_block_" + Twine(static_cast<unsigned>(Idx)));
+ createAndInsertBasicBlocks(CurrentGroup.EndBBs, NewBBs,
+ CurrentGroup.OutlinedFunction,
+ "output_block_" + Twine(Idx));
replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings);
alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBBs,
CurrentGroup.EndBBs, OutputMappings,
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index aa1346d..06deea8 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -48,12 +48,14 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/IR/ModuleSummaryIndexYAML.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
@@ -78,7 +80,6 @@
#include <algorithm>
#include <cassert>
#include <cstdint>
-#include <memory>
#include <set>
#include <string>
#include <system_error>
@@ -803,7 +804,9 @@ Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
return createBitSetTest(ThenB, TIL, BitOffset);
}
- IRBuilder<> ThenB(SplitBlockAndInsertIfThen(OffsetInRange, CI, false));
+ MDBuilder MDB(M.getContext());
+ IRBuilder<> ThenB(SplitBlockAndInsertIfThen(OffsetInRange, CI, false,
+ MDB.createLikelyBranchWeights()));
// Now that we know that the offset is in range and aligned, load the
// appropriate bit from the bitset.
@@ -1470,6 +1473,9 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr(
Constant::getNullValue(F->getType()));
Value *Select = Builder.CreateSelect(ICmp, JT,
Constant::getNullValue(F->getType()));
+
+ if (auto *SI = dyn_cast<SelectInst>(Select))
+ setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE);
// For phi nodes, we need to update the incoming value for all operands
// with the same predecessor.
if (PN)
@@ -1552,12 +1558,7 @@ void LowerTypeTestsModule::createJumpTable(
// Align the whole table by entry size.
F->setAlignment(Align(getJumpTableEntrySize(JumpTableArch)));
- // Skip prologue.
- // Disabled on win32 due to https://llvm.org/bugs/show_bug.cgi?id=28641#c3.
- // Luckily, this function does not get any prologue even without the
- // attribute.
- if (OS != Triple::Win32)
- F->addFnAttr(Attribute::Naked);
+ F->addFnAttr(Attribute::Naked);
if (JumpTableArch == Triple::arm)
F->addFnAttr("target-features", "-thumb-mode");
if (JumpTableArch == Triple::thumb) {
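
Two of the hunks above attach profile metadata: the in-range check created by SplitBlockAndInsertIfThen now carries likely branch weights, and the select synthesised for weak declarations is explicitly marked as having unknown weights when the module is profiled. A minimal standalone sketch (made-up function and block names) of what attaching the likely-weights !prof node looks like with MDBuilder, the helper the patch uses:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/MDBuilder.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/Verifier.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      Module M("likely-weights-demo", Ctx);
      auto *FTy =
          FunctionType::get(Type::getVoidTy(Ctx), {Type::getInt1Ty(Ctx)}, false);
      Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
      auto *Entry = BasicBlock::Create(Ctx, "entry", F);
      auto *Then = BasicBlock::Create(Ctx, "in.range", F);
      auto *Else = BasicBlock::Create(Ctx, "out.of.range", F);
      IRBuilder<> B(Entry);
      BranchInst *Br = B.CreateCondBr(F->getArg(0), Then, Else);
      // Attach the same kind of !prof node the patch now passes to
      // SplitBlockAndInsertIfThen: the "in range" successor is the likely one.
      MDBuilder MDB(Ctx);
      Br->setMetadata(LLVMContext::MD_prof, MDB.createLikelyBranchWeights());
      B.SetInsertPoint(Then);
      B.CreateRetVoid();
      B.SetInsertPoint(Else);
      B.CreateRetVoid();
      M.print(outs(), nullptr);
      return verifyModule(M, &errs()) ? 1 : 0;
    }
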
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 894d83f..0f4bc64 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -107,6 +107,10 @@ STATISTIC(MismatchedCloneAssignments,
STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
+STATISTIC(NumImportantContextIds, "Number of important context ids");
+STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted");
+STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added");
+STATISTIC(NumFixedContexts, "Number of contexts with fixed edges");
static cl::opt<std::string> DotFilePathPrefix(
"memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
@@ -223,9 +227,18 @@ static cl::opt<bool> MemProfRequireDefinitionForPromotion(
extern cl::opt<bool> MemProfReportHintedSizes;
extern cl::opt<unsigned> MinClonedColdBytePercent;
+cl::opt<unsigned> MemProfTopNImportant(
+ "memprof-top-n-important", cl::init(10), cl::Hidden,
+ cl::desc("Number of largest cold contexts to consider important"));
+
+cl::opt<bool> MemProfFixupImportant(
+ "memprof-fixup-important", cl::init(true), cl::Hidden,
+ cl::desc("Enables edge fixup for important contexts"));
+
} // namespace llvm
namespace {
+
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
/// The graph represents the call contexts in all memprof metadata on allocation
@@ -581,17 +594,26 @@ protected:
/// Adds nodes for the given MIB stack ids.
template <class NodeT, class IteratorT>
- void addStackNodesForMIB(ContextNode *AllocNode,
- CallStack<NodeT, IteratorT> &StackContext,
- CallStack<NodeT, IteratorT> &CallsiteContext,
- AllocationType AllocType,
- ArrayRef<ContextTotalSize> ContextSizeInfo);
+ void addStackNodesForMIB(
+ ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
+ CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
+ ArrayRef<ContextTotalSize> ContextSizeInfo,
+ std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold);
/// Matches all callsite metadata (or summary) to the nodes created for
/// allocation memprof MIB metadata, synthesizing new nodes to reflect any
/// inlining performed on those callsite instructions.
void updateStackNodes();
+ /// Optionally fixup edges for the N largest cold contexts to better enable
+ /// cloning. This is particularly helpful if the context includes recursion
+ /// as well as inlining, resulting in a single stack node for multiple stack
+ /// ids in the context. With recursion it is particularly difficult to get the
+ /// edge updates correct as in the general case we have lost the original
+ /// stack id ordering for the context. Do more expensive fixup for the largest
+ /// contexts, controlled by MemProfTopNImportant and MemProfFixupImportant.
+ void fixupImportantContexts();
+
/// Update graph to conservatively handle any callsite stack nodes that target
/// multiple different callee target functions.
void handleCallsitesWithMultipleTargets();
@@ -658,7 +680,8 @@ private:
void assignStackNodesPostOrder(
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
- DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
+ DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
+ const DenseSet<uint32_t> &ImportantContextIds);
/// Duplicates the given set of context ids, updating the provided
/// map from each original id with the newly generated context ids,
@@ -859,6 +882,50 @@ private:
/// nodes.
DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
+ /// Saves information for the contexts identified as important (the largest
+ /// cold contexts up to MemProfTopNImportant).
+ struct ImportantContextInfo {
+ // The original list of leaf first stack ids corresponding to this context.
+ std::vector<uint64_t> StackIds;
+ // Max length of stack ids corresponding to a single stack ContextNode for
+ // this context (i.e. the max length of a key in StackIdsToNode below).
+ unsigned MaxLength = 0;
+ // Mapping of slices of the stack ids to the corresponding ContextNode
+ // (there can be multiple stack ids due to inlining). Populated when
+ // updating stack nodes while matching them to the IR or summary.
+ std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode;
+ };
+
+ // Map of important full context ids to information about each.
+ DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo;
+
+ // For each important context id found in Node (if any), records the list of
+ // stack ids that corresponded to the given callsite Node. There can be more
+ // than one in the case of inlining.
+ void recordStackNode(std::vector<uint64_t> &StackIds, ContextNode *Node,
+ // We pass in the Node's context ids to avoid the
+ // overhead of computing them as the caller already has
+ // them in some cases.
+ const DenseSet<uint32_t> &NodeContextIds,
+ const DenseSet<uint32_t> &ImportantContextIds) {
+ if (!MemProfTopNImportant) {
+ assert(ImportantContextIds.empty());
+ return;
+ }
+ DenseSet<uint32_t> Ids =
+ set_intersection(NodeContextIds, ImportantContextIds);
+ if (Ids.empty())
+ return;
+ auto Size = StackIds.size();
+ for (auto Id : Ids) {
+ auto &Entry = ImportantContextIdInfo[Id];
+ Entry.StackIdsToNode[StackIds] = Node;
+ // Keep track of the max to simplify later analysis.
+ if (Size > Entry.MaxLength)
+ Entry.MaxLength = Size;
+ }
+ }
+
/// Maps to track the calls to their corresponding nodes in the graph.
MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
@@ -1034,11 +1101,11 @@ private:
} // namespace
template <>
-struct llvm::DenseMapInfo<typename CallsiteContextGraph<
+struct llvm::DenseMapInfo<CallsiteContextGraph<
ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
: public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
-struct llvm::DenseMapInfo<typename CallsiteContextGraph<
+struct llvm::DenseMapInfo<CallsiteContextGraph<
IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
: public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
@@ -1353,7 +1420,8 @@ template <class NodeT, class IteratorT>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
- ArrayRef<ContextTotalSize> ContextSizeInfo) {
+ ArrayRef<ContextTotalSize> ContextSizeInfo,
+ std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold) {
// Treating the hot alloc type as NotCold before the disambiguation for "hot"
// is done.
if (AllocType == AllocationType::Hot)
@@ -1361,8 +1429,33 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
ContextIdToAllocationType[++LastContextId] = AllocType;
+ bool IsImportant = false;
if (!ContextSizeInfo.empty()) {
auto &Entry = ContextIdToContextSizeInfos[LastContextId];
+ // If this is a cold allocation, and we are collecting non-zero largest
+ // contexts, see if this is a candidate.
+ if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) {
+ uint64_t TotalCold = 0;
+ for (auto &CSI : ContextSizeInfo)
+ TotalCold += CSI.TotalSize;
+ // Record this context if either we haven't yet recorded the top N largest
+ // contexts, or if it is larger than the smallest already recorded.
+ if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant ||
+ // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly
+ // sorted in ascending order of its key, which is the size.
+ TotalCold > TotalSizeToContextIdTopNCold.begin()->first) {
+ if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) {
+ // Remove old one and its associated entries.
+ auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second;
+ TotalSizeToContextIdTopNCold.erase(
+ TotalSizeToContextIdTopNCold.begin());
+ assert(ImportantContextIdInfo.count(IdToRemove));
+ ImportantContextIdInfo.erase(IdToRemove);
+ }
+ TotalSizeToContextIdTopNCold[TotalCold] = LastContextId;
+ IsImportant = true;
+ }
+ }
Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
}
@@ -1381,6 +1474,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
ContextIter != StackContext.end(); ++ContextIter) {
auto StackId = getStackId(*ContextIter);
+ if (IsImportant)
+ ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId);
ContextNode *StackNode = getNodeForStackId(StackId);
if (!StackNode) {
StackNode = createNewNode(/*IsAllocation=*/false);
@@ -1600,11 +1695,12 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
- assignStackNodesPostOrder(
- ContextNode *Node, DenseSet<const ContextNode *> &Visited,
- DenseMap<uint64_t, std::vector<CallContextInfo>>
- &StackIdToMatchingCalls,
- DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
+ assignStackNodesPostOrder(ContextNode *Node,
+ DenseSet<const ContextNode *> &Visited,
+ DenseMap<uint64_t, std::vector<CallContextInfo>>
+ &StackIdToMatchingCalls,
+ DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
+ const DenseSet<uint32_t> &ImportantContextIds) {
auto Inserted = Visited.insert(Node);
if (!Inserted.second)
return;
@@ -1620,7 +1716,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
continue;
}
assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
- CallToMatchingCall);
+ CallToMatchingCall, ImportantContextIds);
}
// If this node's stack id is in the map, update the graph to contain new
@@ -1648,6 +1744,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
Node->setCall(Call);
NonAllocationCallToContextNodeMap[Call] = Node;
NodeToCallingFunc[Node] = Func;
+ recordStackNode(Ids, Node, Node->getContextIds(), ImportantContextIds);
return;
}
}
@@ -1786,6 +1883,9 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
: CurNode->computeAllocType();
PrevNode = CurNode;
}
+
+ recordStackNode(Ids, NewNode, SavedContextIds, ImportantContextIds);
+
if (VerifyNodes) {
checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
for (auto Id : Ids) {
@@ -1799,6 +1899,122 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
+void CallsiteContextGraph<DerivedCCG, FuncTy,
+ CallTy>::fixupImportantContexts() {
+ if (ImportantContextIdInfo.empty())
+ return;
+
+ // Update statistics as we are done building this map at this point.
+ NumImportantContextIds = ImportantContextIdInfo.size();
+
+ if (!MemProfFixupImportant)
+ return;
+
+ if (ExportToDot)
+ exportToDot("beforestackfixup");
+
+ // For each context we identified as important, walk through the saved context
+ // stack ids in order from leaf upwards, and make sure all edges are correct.
+ // These can be difficult to get right when updating the graph while mapping
+ // nodes onto summary or IR, especially when there is recursion. In
+ // particular, when we have created new nodes to reflect inlining, it is
+ // sometimes impossible to know exactly how to update the edges in the face of
+ // recursion, as we have lost the original ordering of the stack ids in the
+ // contexts.
+ // TODO: Consider only doing this if we detect the context has recursive
+ // cycles.
+ //
+ // I.e. assume we have a context with stack ids like: {A B A C A D E}
+ // and let's say A was inlined into B, C, and D. The original graph will have
+ // multiple recursive cycles through A. When we match the original context
+ // nodes onto the IR or summary, we will merge {A B} into one context node,
+ // {A C} onto another, and {A D} onto another. Looking at the stack sequence
+ // above, we should end up with a non-cyclic set of edges like:
+ // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the
+ // original ordering, we won't get the edges correct initially (it's
+ // impossible without the original ordering). Here we do the fixup (adding
+ // and removing edges where necessary) for this context. In the
+ // ImportantContextInfo struct in this case we should have a MaxLength = 2,
+ // and map entries for {A B}, {A C}, {A D}, and {E}.
+ for (auto &[CurContextId, Info] : ImportantContextIdInfo) {
+ if (Info.StackIdsToNode.empty())
+ continue;
+ bool Changed = false;
+ ContextNode *PrevNode = nullptr;
+ ContextNode *CurNode = nullptr;
+ DenseSet<const ContextEdge *> VisitedEdges;
+ ArrayRef<uint64_t> AllStackIds(Info.StackIds);
+ // Try to identify what callsite ContextNode maps to which slice of the
+ // context's ordered stack ids.
+ for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) {
+ // We will do this greedily, trying up to MaxLength stack ids in a row, to
+ // see if we recorded a context node for that sequence.
+ auto Len = Info.MaxLength;
+ auto LenToEnd = AllStackIds.size() - I;
+ if (Len > LenToEnd)
+ Len = LenToEnd;
+ CurNode = nullptr;
+ // Try to find a recorded context node starting with the longest length
+ // recorded, and on down until we check for just a single stack node.
+ for (; Len > 0; Len--) {
+ // Get the slice of the original stack id sequence to check.
+ auto CheckStackIds = AllStackIds.slice(I, Len);
+ auto EntryIt = Info.StackIdsToNode.find(CheckStackIds);
+ if (EntryIt == Info.StackIdsToNode.end())
+ continue;
+ CurNode = EntryIt->second;
+ // Skip forward so we don't try to look for the ones we just matched.
+ // We increment by Len - 1, because the outer for loop will increment I.
+ I += Len - 1;
+ break;
+ }
+ // Give up if we couldn't find a node. Since we need to clone from the
+ // leaf allocation upwards, no sense in doing any more fixup further up
+ // the context if we couldn't match part of the original stack context
+ // onto a callsite node.
+ if (!CurNode)
+ break;
+ // No edges to fix up until we have a pair of nodes that should be
+ // adjacent in the graph.
+ if (!PrevNode)
+ continue;
+ // See if we already have a call edge from CurNode to PrevNode.
+ auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode);
+ if (CurEdge) {
+ // We already have an edge. Make sure it contains this context id.
+ if (CurEdge->getContextIds().insert(CurContextId).second) {
+ NumFixupEdgeIdsInserted++;
+ Changed = true;
+ }
+ } else {
+ // No edge exists - add one.
+ NumFixupEdgesAdded++;
+ DenseSet<uint32_t> ContextIds({CurContextId});
+ auto AllocType = computeAllocType(ContextIds);
+ auto NewEdge = std::make_shared<ContextEdge>(
+ PrevNode, CurNode, AllocType, std::move(ContextIds));
+ PrevNode->CallerEdges.push_back(NewEdge);
+ CurNode->CalleeEdges.push_back(NewEdge);
+ // Save the new edge for the below handling.
+ CurEdge = NewEdge.get();
+ Changed = true;
+ }
+ VisitedEdges.insert(CurEdge);
+ // Now remove this context id from any other caller edges calling
+ // PrevNode.
+ for (auto &Edge : PrevNode->CallerEdges) {
+ // Skip the edge updated/created above and edges we have already
+ // visited (due to recursion).
+ if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get()))
+ Edge->getContextIds().erase(CurContextId);
+ }
+ }
+ if (Changed)
+ NumFixedContexts++;
+ }
+}
+
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
// Map of stack id to all calls with that as the last (outermost caller)
// callsite id that has a context node (some might not due to pruning
@@ -2043,9 +2259,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
// nodes representing any inlining at interior callsites. Note we move the
// associated context ids over to the new nodes.
DenseSet<const ContextNode *> Visited;
+ DenseSet<uint32_t> ImportantContextIds(llvm::from_range,
+ ImportantContextIdInfo.keys());
for (auto &Entry : AllocationCallToContextNodeMap)
assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
- CallToMatchingCall);
+ CallToMatchingCall, ImportantContextIds);
+
+ fixupImportantContexts();
+
if (VerifyCCG)
check();
}
@@ -2155,6 +2376,10 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
Module &M,
llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
: Mod(M), OREGetter(OREGetter) {
+ // Map for keeping track of the largest cold contexts up to the number given
+ // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
+ // must be sorted.
+ std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
for (auto &F : M) {
std::vector<CallInfo> CallsWithMetadata;
for (auto &BB : F) {
@@ -2191,7 +2416,8 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
addStackNodesForMIB<MDNode, MDNode::op_iterator>(
AllocNode, StackContext, CallsiteContext,
- getMIBAllocType(MIBMD), ContextSizeInfo);
+ getMIBAllocType(MIBMD), ContextSizeInfo,
+ TotalSizeToContextIdTopNCold);
}
// If exporting the graph to dot and an allocation id of interest was
// specified, record all the context ids for this allocation node.
@@ -2241,6 +2467,10 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing)
: Index(Index), isPrevailing(isPrevailing) {
+ // Map for keeping track of the largest cold contexts up to the number given
+ // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
+ // must be sorted.
+ std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
for (auto &I : Index) {
auto VI = Index.getValueInfo(I);
for (auto &S : VI.getSummaryList()) {
@@ -2288,7 +2518,7 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
}
addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
AllocNode, StackContext, EmptyContext, MIB.AllocType,
- ContextSizeInfo);
+ ContextSizeInfo, TotalSizeToContextIdTopNCold);
I++;
}
// If exporting the graph to dot and an allocation id of interest was
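
Two of the additions above are easier to see in small standalone sketches. First, the top-N bookkeeping threaded into addStackNodesForMIB: the std::map is keyed by total cold size, so its first element is always the smallest recorded context and is evicted when a larger candidate appears. Plain types are used here, no memprof machinery, and the sizes and ids are made up.

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <utility>

    int main() {
      const unsigned TopN = 3; // Stands in for MemProfTopNImportant.
      std::map<uint64_t, uint32_t> SizeToContextId;
      // (TotalCold, ContextId) pairs in the order they are encountered.
      const std::pair<uint64_t, uint32_t> Contexts[] = {
          {100, 1}, {40, 2}, {900, 3}, {60, 4}, {500, 5}};
      for (auto [TotalCold, Id] : Contexts) {
        if (SizeToContextId.size() < TopN ||
            TotalCold > SizeToContextId.begin()->first) {
          if (SizeToContextId.size() == TopN)
            SizeToContextId.erase(SizeToContextId.begin()); // Drop the smallest.
          SizeToContextId[TotalCold] = Id;
        }
      }
      // Prints the surviving "important" contexts: 100->1, 500->5, 900->3.
      for (auto [Size, Id] : SizeToContextId)
        std::printf("size %llu -> context %u\n", (unsigned long long)Size, Id);
      return 0;
    }

Second, the greedy slice matching in fixupImportantContexts, replayed on the {A B A C A D E} example from the comment above (A inlined into B, C and D, so MaxLength is 2): the walk recovers the node order {A,B} -> {A,C} -> {A,D} -> {E}, which is the edge chain the fixup then enforces. Strings stand in for stack ids and node labels.

    #include <algorithm>
    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> StackIds = {"A", "B", "A", "C", "A", "D", "E"};
      // Slices of the context that matched a single callsite node, as recorded
      // in ImportantContextInfo::StackIdsToNode (node names are just labels).
      std::map<std::vector<std::string>, std::string> SliceToNode = {
          {{"A", "B"}, "AB"}, {{"A", "C"}, "AC"}, {{"A", "D"}, "AD"}, {{"E"}, "E"}};
      unsigned MaxLength = 2;

      for (unsigned I = 0; I < StackIds.size(); ++I) {
        unsigned Len = std::min<unsigned>(MaxLength, StackIds.size() - I);
        // Greedily try the longest recorded slice first, then shorter ones.
        for (; Len > 0; --Len) {
          std::vector<std::string> Slice(StackIds.begin() + I,
                                         StackIds.begin() + I + Len);
          auto It = SliceToNode.find(Slice);
          if (It == SliceToNode.end())
            continue;
          std::printf("node %s\n", It->second.c_str());
          I += Len - 1; // Skip the ids this node already covers.
          break;
        }
      }
      return 0;
    }
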
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index d7eb745..1f52f9a 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -208,7 +208,7 @@ namespace KernelInfo {
// };
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \
- constexpr const unsigned MEMBER##Idx = IDX;
+ constexpr unsigned MEMBER##Idx = IDX;
KERNEL_ENVIRONMENT_IDX(Configuration, 0)
KERNEL_ENVIRONMENT_IDX(Ident, 1)
@@ -216,7 +216,7 @@ KERNEL_ENVIRONMENT_IDX(Ident, 1)
#undef KERNEL_ENVIRONMENT_IDX
#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \
- constexpr const unsigned MEMBER##Idx = IDX;
+ constexpr unsigned MEMBER##Idx = IDX;
KERNEL_ENVIRONMENT_CONFIGURATION_IDX(UseGenericStateMachine, 0)
KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MayUseNestedParallelism, 1)
@@ -258,7 +258,7 @@ KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxTeams)
GlobalVariable *
getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB) {
- constexpr const int InitKernelEnvironmentArgNo = 0;
+ constexpr int InitKernelEnvironmentArgNo = 0;
return cast<GlobalVariable>(
KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo)
->stripPointerCasts());
@@ -730,7 +730,7 @@ struct KernelInfoState : AbstractState {
/// State to indicate if we can track parallel level of the associated
/// function. We will give up tracking if we encounter unknown caller or the
- /// caller is __kmpc_parallel_51.
+ /// caller is __kmpc_parallel_60.
BooleanStateWithSetVector<uint8_t> ParallelLevels;
/// Flag that indicates if the kernel has nested Parallelism
@@ -2117,8 +2117,8 @@ Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
return getUniqueKernelFor(*CB);
OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
- // Allow the use in __kmpc_parallel_51 calls.
+ OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60];
+ // Allow the use in __kmpc_parallel_60 calls.
if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI))
return getUniqueKernelFor(*CB);
return nullptr;
@@ -2145,7 +2145,7 @@ Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
+ OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60];
bool Changed = false;
if (!KernelParallelRFI)
@@ -2157,7 +2157,7 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
for (Function *F : SCC) {
- // Check if the function is a use in a __kmpc_parallel_51 call at
+ // Check if the function is a use in a __kmpc_parallel_60 call at
// all.
bool UnknownUse = false;
bool KernelParallelUse = false;
@@ -2189,7 +2189,7 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
UnknownUse = true;
});
- // Do not emit a remark if we haven't seen a __kmpc_parallel_51
+ // Do not emit a remark if we haven't seen a __kmpc_parallel_60
// use.
if (!KernelParallelUse)
continue;
@@ -2208,7 +2208,7 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
continue;
}
- // Even if we have __kmpc_parallel_51 calls, we (for now) give
+ // Even if we have __kmpc_parallel_60 calls, we (for now) give
// up if the function is not called from a unique kernel.
Kernel K = getUniqueKernelFor(*F);
if (!K) {
@@ -4457,7 +4457,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
Instruction *IsWorker =
ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB,
- ConstantInt::get(KernelInitCB->getType(), -1),
+ ConstantInt::getAllOnesValue(KernelInitCB->getType()),
"thread.is_worker", InitBB);
IsWorker->setDebugLoc(DLoc);
BranchInst::Create(IsWorkerCheckBB, UserCodeEntryBB, IsWorker, InitBB);
@@ -4821,8 +4821,8 @@ private:
/// Update info regarding parallel levels.
void updateParallelLevels(Attributor &A) {
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
+ OMPInformationCache::RuntimeFunctionInfo &Parallel60RFI =
+ OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60];
auto PredCallSite = [&](AbstractCallSite ACS) {
Function *Caller = ACS.getInstruction()->getFunction();
@@ -4832,12 +4832,12 @@ private:
auto *CAA =
A.getOrCreateAAFor<AAKernelInfo>(IRPosition::function(*Caller));
if (CAA && CAA->ParallelLevels.isValidState()) {
- // Any function that is called by `__kmpc_parallel_51` will not be
+ // Any function that is called by `__kmpc_parallel_60` will not be
// folded as the parallel level in the function is updated. In order to
// get it right, all the analysis would depend on the implentation. That
// said, if in the future any change to the implementation, the analysis
// could be wrong. As a consequence, we are just conservative here.
- if (Caller == Parallel51RFI.Declaration) {
+ if (Caller == Parallel60RFI.Declaration) {
ParallelLevels.indicatePessimisticFixpoint();
return true;
}
@@ -5006,8 +5006,8 @@ struct AAKernelInfoCallSite : AAKernelInfo {
case OMPRTL___kmpc_target_deinit:
KernelDeinitCB = &CB;
break;
- case OMPRTL___kmpc_parallel_51:
- if (!handleParallel51(A, CB))
+ case OMPRTL___kmpc_parallel_60:
+ if (!handleParallel60(A, CB))
indicatePessimisticFixpoint();
return;
case OMPRTL___kmpc_omp_task:
@@ -5075,8 +5075,8 @@ struct AAKernelInfoCallSite : AAKernelInfo {
return indicatePessimisticFixpoint();
CallBase &CB = cast<CallBase>(getAssociatedValue());
- if (It->getSecond() == OMPRTL___kmpc_parallel_51) {
- if (!handleParallel51(A, CB))
+ if (It->getSecond() == OMPRTL___kmpc_parallel_60) {
+ if (!handleParallel60(A, CB))
return indicatePessimisticFixpoint();
return StateBefore == getState() ? ChangeStatus::UNCHANGED
: ChangeStatus::CHANGED;
@@ -5136,9 +5136,9 @@ struct AAKernelInfoCallSite : AAKernelInfo {
: ChangeStatus::CHANGED;
}
- /// Deal with a __kmpc_parallel_51 call (\p CB). Returns true if the call was
+ /// Deal with a __kmpc_parallel_60 call (\p CB). Returns true if the call was
/// handled, if a problem occurred, false is returned.
- bool handleParallel51(Attributor &A, CallBase &CB) {
+ bool handleParallel60(Attributor &A, CallBase &CB) {
const unsigned int NonWrapperFunctionArgNo = 5;
const unsigned int WrapperFunctionArgNo = 6;
auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index e39e311..8e76b79 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -83,7 +83,6 @@
#include <cstdint>
#include <functional>
#include <limits>
-#include <map>
#include <memory>
#include <queue>
#include <string>
@@ -2293,7 +2292,6 @@ bool SampleProfileLoader::runOnFunction(Function &F,
// count value.
if (!F.getEntryCount())
F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
- std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent())
.getManager();
ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
index 70b8614..b9fb7a3 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
@@ -18,6 +18,8 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/LongestCommonSequence.h"
+#include <unordered_set>
+
using namespace llvm;
using namespace sampleprof;
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 2dd0fde..7aa90ee 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -99,6 +99,7 @@
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
@@ -130,6 +131,8 @@ STATISTIC(NumUniqueRetVal, "Number of unique return value optimizations");
STATISTIC(NumVirtConstProp1Bit,
"Number of 1 bit virtual constant propagations");
STATISTIC(NumVirtConstProp, "Number of virtual constant propagations");
+DEBUG_COUNTER(CallsToDevirt, "calls-to-devirt",
+ "Controls how many calls should be devirtualized.");
namespace llvm {
@@ -219,14 +222,6 @@ static cl::opt<bool> WholeProgramDevirtKeepUnreachableFunction(
cl::desc("Regard unreachable functions as possible devirtualize targets."),
cl::Hidden, cl::init(true));
-/// If explicitly specified, the devirt module pass will stop transformation
-/// once the total number of devirtualizations reach the cutoff value. Setting
-/// this option to 0 explicitly will do 0 devirtualization.
-static cl::opt<unsigned> WholeProgramDevirtCutoff(
- "wholeprogramdevirt-cutoff",
- cl::desc("Max number of devirtualizations for devirt module pass"),
- cl::init(0));
-
/// Mechanism to add runtime checking of devirtualization decisions, optionally
/// trapping or falling back to indirect call on any that are not correct.
/// Trapping mode is useful for debugging undefined behavior leading to failures
@@ -377,9 +372,6 @@ VirtualCallTarget::VirtualCallTarget(GlobalValue *Fn, const TypeMemberInfo *TM)
namespace {
-// Tracks the number of devirted calls in the IR transformation.
-static unsigned NumDevirtCalls = 0;
-
// A slot in a set of virtual tables. The TypeID identifies the set of virtual
// tables, and the ByteOffset is the offset in bytes from the address point to
// the virtual function pointer.
@@ -636,9 +628,11 @@ struct DevirtModule {
std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest;
PatternList FunctionsToSkip;
+ const bool DevirtSpeculatively;
DevirtModule(Module &M, ModuleAnalysisManager &MAM,
ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary)
+ const ModuleSummaryIndex *ImportSummary,
+ bool DevirtSpeculatively)
: M(M), MAM(MAM),
FAM(MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()),
ExportSummary(ExportSummary), ImportSummary(ImportSummary),
@@ -651,7 +645,8 @@ struct DevirtModule {
RemarksEnabled(areRemarksEnabled()),
OREGetter([&](Function &F) -> OptimizationRemarkEmitter & {
return FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- }) {
+ }),
+ DevirtSpeculatively(DevirtSpeculatively) {
assert(!(ExportSummary && ImportSummary));
FunctionsToSkip.init(SkipFunctionNames);
}
@@ -765,7 +760,8 @@ struct DevirtModule {
// Lower the module using the action and summary passed as command line
// arguments. For testing purposes only.
- static bool runForTesting(Module &M, ModuleAnalysisManager &MAM);
+ static bool runForTesting(Module &M, ModuleAnalysisManager &MAM,
+ bool DevirtSpeculatively);
};
struct DevirtIndex {
@@ -808,11 +804,22 @@ struct DevirtIndex {
PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
ModuleAnalysisManager &MAM) {
if (UseCommandLine) {
- if (!DevirtModule::runForTesting(M, MAM))
+ if (!DevirtModule::runForTesting(M, MAM, ClDevirtualizeSpeculatively))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
- if (!DevirtModule(M, MAM, ExportSummary, ImportSummary).run())
+
+ std::optional<ModuleSummaryIndex> Index;
+ if (!ExportSummary && !ImportSummary && DevirtSpeculatively) {
+ // Build the ExportSummary from the module.
+ assert(!ExportSummary &&
+ "ExportSummary is expected to be empty in non-LTO mode");
+ ProfileSummaryInfo PSI(M);
+ Index.emplace(buildModuleSummaryIndex(M, nullptr, &PSI));
+ ExportSummary = Index.has_value() ? &Index.value() : nullptr;
+ }
+ if (!DevirtModule(M, MAM, ExportSummary, ImportSummary, DevirtSpeculatively)
+ .run())
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
@@ -1010,7 +1017,8 @@ static Error checkCombinedSummaryForTesting(ModuleSummaryIndex *Summary) {
return ErrorSuccess();
}
-bool DevirtModule::runForTesting(Module &M, ModuleAnalysisManager &MAM) {
+bool DevirtModule::runForTesting(Module &M, ModuleAnalysisManager &MAM,
+ bool DevirtSpeculatively) {
std::unique_ptr<ModuleSummaryIndex> Summary =
std::make_unique<ModuleSummaryIndex>(/*HaveGVs=*/false);
@@ -1039,7 +1047,8 @@ bool DevirtModule::runForTesting(Module &M, ModuleAnalysisManager &MAM) {
ClSummaryAction == PassSummaryAction::Export ? Summary.get()
: nullptr,
ClSummaryAction == PassSummaryAction::Import ? Summary.get()
- : nullptr)
+ : nullptr,
+ DevirtSpeculatively)
.run();
if (!ClWriteSummary.empty()) {
@@ -1103,10 +1112,10 @@ bool DevirtModule::tryFindVirtualCallTargets(
if (!TM.Bits->GV->isConstant())
return false;
- // Without ClDevirtualizeSpeculatively, we cannot perform whole program
+ // Without DevirtSpeculatively, we cannot perform whole program
// devirtualization analysis on a vtable with public LTO visibility.
- if (!ClDevirtualizeSpeculatively && TM.Bits->GV->getVCallVisibility() ==
- GlobalObject::VCallVisibilityPublic)
+ if (!DevirtSpeculatively && TM.Bits->GV->getVCallVisibility() ==
+ GlobalObject::VCallVisibilityPublic)
return false;
Function *Fn = nullptr;
@@ -1127,7 +1136,7 @@ bool DevirtModule::tryFindVirtualCallTargets(
// In most cases empty functions will be overridden by the
// implementation of the derived class, so we can skip them.
- if (ClDevirtualizeSpeculatively && Fn->getReturnType()->isVoidTy() &&
+ if (DevirtSpeculatively && Fn->getReturnType()->isVoidTy() &&
Fn->getInstructionCount() <= 1)
continue;
@@ -1216,15 +1225,13 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
continue;
// Stop when the number of devirted calls reaches the cutoff.
- if (WholeProgramDevirtCutoff.getNumOccurrences() > 0 &&
- NumDevirtCalls >= WholeProgramDevirtCutoff)
- return;
+ if (!DebugCounter::shouldExecute(CallsToDevirt))
+ continue;
if (RemarksEnabled)
VCallSite.emitRemark("single-impl",
TheFn->stripPointerCasts()->getName(), OREGetter);
NumSingleImpl++;
- NumDevirtCalls++;
auto &CB = VCallSite.CB;
assert(!CB.getCalledFunction() && "devirtualizing direct call?");
IRBuilder<> Builder(&CB);
@@ -1250,8 +1257,7 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
// add support to compare the virtual function pointer to the
// devirtualized target. In case of a mismatch, fall back to indirect
// call.
- if (DevirtCheckMode == WPDCheckMode::Fallback ||
- ClDevirtualizeSpeculatively) {
+ if (DevirtCheckMode == WPDCheckMode::Fallback || DevirtSpeculatively) {
MDNode *Weights = MDBuilder(M.getContext()).createLikelyBranchWeights();
// Version the indirect call site. If the called value is equal to the
// given callee, 'NewInst' will be executed, otherwise the original call
@@ -2375,7 +2381,7 @@ bool DevirtModule::run() {
Function *PublicTypeTestFunc = nullptr;
// If we are in speculative devirtualization mode, we can work on the public
// type test intrinsics.
- if (ClDevirtualizeSpeculatively)
+ if (DevirtSpeculatively)
PublicTypeTestFunc =
Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test);
Function *TypeTestFunc =
@@ -2511,7 +2517,7 @@ bool DevirtModule::run() {
// Out of speculative devirtualization mode, Try to apply virtual constant
// propagation or branch funneling.
// TODO: This should eventually be enabled for non-public type tests.
- if (!SingleImplDevirt && !ClDevirtualizeSpeculatively) {
+ if (!SingleImplDevirt && !DevirtSpeculatively) {
DidVirtualConstProp |=
tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first);
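
The WholeProgramDevirt hunks above replace the pass-specific -wholeprogramdevirt-cutoff flag with a DebugCounter, so the cap on devirtualized calls is now driven by the generic -debug-counter machinery. Below is a minimal standalone sketch of that pattern; the counter name and the loop are made up, while the real counter is calls-to-devirt and the guard sits in applySingleImplDevirt. When no bound is given on the command line, shouldExecute() returns true every time, matching the previous default of unlimited devirtualization; see the -debug-counter option for the exact syntax used to bound it.

    #include "llvm/Support/CommandLine.h"
    #include "llvm/Support/DebugCounter.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Declares a named counter, like DEBUG_COUNTER(CallsToDevirt, ...) above.
    DEBUG_COUNTER(DemoDevirt, "demo-calls-to-devirt",
                  "Controls how many demo call sites get transformed.");

    int main(int argc, char **argv) {
      // Registers and parses -debug-counter so the counter can be bounded.
      cl::ParseCommandLineOptions(argc, argv);
      for (int CallSite = 0; CallSite < 4; ++CallSite) {
        // Mirrors the shouldExecute() guard the patch adds before each
        // devirtualization.
        if (!DebugCounter::shouldExecute(DemoDevirt)) {
          outs() << "skipping call site " << CallSite << "\n";
          continue;
        }
        outs() << "devirtualizing call site " << CallSite << "\n";
      }
      return 0;
    }
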