Diffstat (limited to 'llvm/lib/Transforms/IPO')
-rw-r--r--   llvm/lib/Transforms/IPO/AttributorAttributes.cpp         |  84
-rw-r--r--   llvm/lib/Transforms/IPO/ExpandVariadics.cpp              |   7
-rw-r--r--   llvm/lib/Transforms/IPO/GlobalOpt.cpp                    | 299
-rw-r--r--   llvm/lib/Transforms/IPO/IROutliner.cpp                   |   9
-rw-r--r--   llvm/lib/Transforms/IPO/LowerTypeTests.cpp               |  17
-rw-r--r--   llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 266
-rw-r--r--   llvm/lib/Transforms/IPO/OpenMPOpt.cpp                    |  42
-rw-r--r--   llvm/lib/Transforms/IPO/SampleProfile.cpp                |   2
-rw-r--r--   llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp         |   2
-rw-r--r--   llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp           |  66
10 files changed, 604 insertions, 190 deletions
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 5048561..074d8ed 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -3619,7 +3619,7 @@ struct AAIntraFnReachabilityFunction final return true; RQITy StackRQI(A, From, To, ExclusionSet, false); - typename RQITy::Reachable Result; + RQITy::Reachable Result; if (!NonConstThis->checkQueryCache(A, StackRQI, Result)) return NonConstThis->isReachableImpl(A, StackRQI, /*IsTemporaryRQI=*/true); @@ -5185,6 +5185,7 @@ struct AADereferenceableCallSiteReturned final // ------------------------ Align Argument Attribute ------------------------ namespace { + static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, Value &AssociatedValue, const Use *U, const Instruction *I, bool &TrackUse) { @@ -5200,6 +5201,35 @@ static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, TrackUse = true; return 0; } + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + // Is it appropriate to pull attribute in initialization? + const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + QueryingAA, IRPosition::value(*II->getOperand(1)), DepClassTy::NONE); + const auto *AlignAA = A.getAAFor<AAAlign>( + QueryingAA, IRPosition::value(*II), DepClassTy::NONE); + if (ConstVals && ConstVals->isValidState() && ConstVals->isAtFixpoint()) { + unsigned ShiftValue = std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Align ConstAlign(UINT64_C(1) << ShiftValue); + if (ConstAlign >= AlignAA->getKnownAlign()) + return Align(1).value(); + } + if (AlignAA) + return AlignAA->getKnownAlign().value(); + break; + } + case Intrinsic::amdgcn_make_buffer_rsrc: { + const auto *AlignAA = A.getAAFor<AAAlign>( + QueryingAA, IRPosition::value(*II), DepClassTy::NONE); + if (AlignAA) + return AlignAA->getKnownAlign().value(); + break; + } + default: + break; + } MaybeAlign MA; if (const auto *CB = dyn_cast<CallBase>(I)) { @@ -5499,6 +5529,56 @@ struct AAAlignCallSiteReturned final AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {} + ChangeStatus updateImpl(Attributor &A) override { + Instruction *I = getIRPosition().getCtxI(); + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + Align Alignment; + bool Valid = false; + + const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + *this, IRPosition::value(*II->getOperand(1)), DepClassTy::REQUIRED); + if (ConstVals && ConstVals->isValidState()) { + unsigned ShiftValue = + std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Alignment = Align(UINT64_C(1) << ShiftValue); + Valid = true; + } + + const auto *AlignAA = + A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))), + DepClassTy::REQUIRED); + if (AlignAA) { + Alignment = std::max(AlignAA->getAssumedAlign(), Alignment); + Valid = true; + } + + if (Valid) + return clampStateAndIndicateChange<StateType>( + this->getState(), + std::min(this->getAssumedAlign(), Alignment).value()); + break; + } + // FIXME: Should introduce target specific sub-attributes and letting + // getAAfor<AAAlign> lead to create sub-attribute to handle target + // specific intrinsics. 
+ case Intrinsic::amdgcn_make_buffer_rsrc: { + const auto *AlignAA = + A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))), + DepClassTy::REQUIRED); + if (AlignAA) + return clampStateAndIndicateChange<StateType>( + this->getState(), AlignAA->getAssumedAlign().value()); + break; + } + default: + break; + } + } + return Base::updateImpl(A); + }; /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); } }; @@ -10701,7 +10781,7 @@ struct AAInterFnReachabilityFunction auto *NonConstThis = const_cast<AAInterFnReachabilityFunction *>(this); RQITy StackRQI(A, From, To, ExclusionSet, false); - typename RQITy::Reachable Result; + RQITy::Reachable Result; if (!NonConstThis->checkQueryCache(A, StackRQI, Result)) return NonConstThis->isReachableImpl(A, StackRQI, /*IsTemporaryRQI=*/true); diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp index 6a11aec..4863d6b 100644 --- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp +++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp @@ -422,6 +422,13 @@ bool ExpandVariadics::runOnFunction(Module &M, IRBuilder<> &Builder, assert(VariadicWrapperDefine == VariadicWrapper); assert(!VariadicWrapper->isDeclaration()); + // Add the prof metadata from the original function to the wrapper. Because + // FixedArityReplacement is the owner of original function's prof metadata + // after the splice, we need to transfer it to VariadicWrapper. + VariadicWrapper->setMetadata( + LLVMContext::MD_prof, + FixedArityReplacement->getMetadata(LLVMContext::MD_prof)); + // We now have: // 1. the original function, now as a declaration with no uses // 2. a variadic function that unconditionally calls a fixed arity replacement diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 99c4982..66b95fc 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -99,6 +99,11 @@ static cl::opt<bool> "functions from non-versioned callers."), cl::init(true), cl::Hidden); +static cl::opt<unsigned> MaxIFuncVersions( + "max-ifunc-versions", cl::Hidden, cl::init(5), + cl::desc("Maximum number of caller/callee versions that is allowed for " + "using the expensive (cubic) static resolution algorithm.")); + static cl::opt<bool> EnableColdCCStressTest("enable-coldcc-stress-test", cl::desc("Enable stress test of coldcc by adding " @@ -2018,12 +2023,15 @@ OptimizeFunctions(Module &M, if (hasChangeableCC(&F, ChangeableCCCache)) { // If this function has a calling convention worth changing, is not a - // varargs function, and is only called directly, promote it to use the - // Fast calling convention. - F.setCallingConv(CallingConv::Fast); - ChangeCalleesToFastCall(&F); - ++NumFastCallFns; - Changed = true; + // varargs function, is only called directly, and is supported by the + // target, promote it to use the Fast calling convention. + TargetTransformInfo &TTI = GetTTI(F); + if (TTI.useFastCCForInternalCall(F)) { + F.setCallingConv(CallingConv::Fast); + ChangeCalleesToFastCall(&F); + ++NumFastCallFns; + Changed = true; + } } if (F.getAttributes().hasAttrSomewhere(Attribute::Nest) && @@ -2488,20 +2496,21 @@ DeleteDeadIFuncs(Module &M, // Follows the use-def chain of \p V backwards until it finds a Function, // in which case it collects in \p Versions. Return true on successful // use-def chain traversal, false otherwise. 
-static bool collectVersions(TargetTransformInfo &TTI, Value *V, - SmallVectorImpl<Function *> &Versions) { +static bool +collectVersions(Value *V, SmallVectorImpl<Function *> &Versions, + function_ref<TargetTransformInfo &(Function &)> GetTTI) { if (auto *F = dyn_cast<Function>(V)) { - if (!TTI.isMultiversionedFunction(*F)) + if (!GetTTI(*F).isMultiversionedFunction(*F)) return false; Versions.push_back(F); } else if (auto *Sel = dyn_cast<SelectInst>(V)) { - if (!collectVersions(TTI, Sel->getTrueValue(), Versions)) + if (!collectVersions(Sel->getTrueValue(), Versions, GetTTI)) return false; - if (!collectVersions(TTI, Sel->getFalseValue(), Versions)) + if (!collectVersions(Sel->getFalseValue(), Versions, GetTTI)) return false; } else if (auto *Phi = dyn_cast<PHINode>(V)) { for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) - if (!collectVersions(TTI, Phi->getIncomingValue(I), Versions)) + if (!collectVersions(Phi->getIncomingValue(I), Versions, GetTTI)) return false; } else { // Unknown instruction type. Bail. @@ -2510,31 +2519,45 @@ static bool collectVersions(TargetTransformInfo &TTI, Value *V, return true; } -// Bypass the IFunc Resolver of MultiVersioned functions when possible. To -// deduce whether the optimization is legal we need to compare the target -// features between caller and callee versions. The criteria for bypassing -// the resolver are the following: -// -// * If the callee's feature set is a subset of the caller's feature set, -// then the callee is a candidate for direct call. -// -// * Among such candidates the one of highest priority is the best match -// and it shall be picked, unless there is a version of the callee with -// higher priority than the best match which cannot be picked from a -// higher priority caller (directly or through the resolver). -// -// * For every higher priority callee version than the best match, there -// is a higher priority caller version whose feature set availability -// is implied by the callee's feature set. +// Try to statically resolve calls to versioned functions when possible. First +// we identify the function versions which are associated with an IFUNC symbol. +// We do that by examining the resolver function of the IFUNC. Once we have +// collected all the function versions, we sort them in decreasing priority +// order. This is necessary for determining the most suitable callee version +// for each caller version. We then collect all the callsites to versioned +// functions. The static resolution is performed by comparing the feature sets +// between callers and callees. Specifically: +// * Start a walk over caller and callee lists simultaneously in order of +// decreasing priority. +// * Statically resolve calls from the current caller to the current callee, +// iff the caller feature bits are a superset of the callee feature bits. +// * For FMV callers, as long as the caller feature bits are a subset of the +// callee feature bits, advance to the next callee. This effectively prevents +// considering the current callee as a candidate for static resolution by +// following callers (explanation: preceding callers would not have been +// selected in a hypothetical runtime execution). +// * Advance to the next caller. 
// +// Presentation in EuroLLVM2025: +// https://www.youtube.com/watch?v=k54MFimPz-A&t=867s static bool OptimizeNonTrivialIFuncs( Module &M, function_ref<TargetTransformInfo &(Function &)> GetTTI) { bool Changed = false; - // Cache containing the mask constructed from a function's target features. + // Map containing the feature bits for a given function. DenseMap<Function *, APInt> FeatureMask; + // Map containing the priority bits for a given function. + DenseMap<Function *, APInt> PriorityMask; + // Map containing all the function versions corresponding to an IFunc symbol. + DenseMap<GlobalIFunc *, SmallVector<Function *>> VersionedFuncs; + // Map containing the IFunc symbol a function is version of. + DenseMap<Function *, GlobalIFunc *> VersionOf; + // List of all the interesting IFuncs found in the module. + SmallVector<GlobalIFunc *> IFuncs; for (GlobalIFunc &IF : M.ifuncs()) { + LLVM_DEBUG(dbgs() << "Examining IFUNC " << IF.getName() << "\n"); + if (IF.isInterposable()) continue; @@ -2545,107 +2568,175 @@ static bool OptimizeNonTrivialIFuncs( if (Resolver->isInterposable()) continue; - TargetTransformInfo &TTI = GetTTI(*Resolver); - - // Discover the callee versions. - SmallVector<Function *> Callees; - if (any_of(*Resolver, [&TTI, &Callees](BasicBlock &BB) { + SmallVector<Function *> Versions; + // Discover the versioned functions. + if (any_of(*Resolver, [&](BasicBlock &BB) { if (auto *Ret = dyn_cast_or_null<ReturnInst>(BB.getTerminator())) - if (!collectVersions(TTI, Ret->getReturnValue(), Callees)) + if (!collectVersions(Ret->getReturnValue(), Versions, GetTTI)) return true; return false; })) continue; - if (Callees.empty()) + if (Versions.empty()) continue; - LLVM_DEBUG(dbgs() << "Statically resolving calls to function " - << Resolver->getName() << "\n"); - - // Cache the feature mask for each callee. - for (Function *Callee : Callees) { - auto [It, Inserted] = FeatureMask.try_emplace(Callee); - if (Inserted) - It->second = TTI.getFeatureMask(*Callee); + for (Function *V : Versions) { + VersionOf.insert({V, &IF}); + auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(V); + if (FeatInserted) + FeatIt->second = GetTTI(*V).getFeatureMask(*V); + auto [PriorIt, PriorInserted] = PriorityMask.try_emplace(V); + if (PriorInserted) + PriorIt->second = GetTTI(*V).getPriorityMask(*V); } - // Sort the callee versions in decreasing priority order. - sort(Callees, [&](auto *LHS, auto *RHS) { - return FeatureMask[LHS].ugt(FeatureMask[RHS]); + // Sort function versions in decreasing priority order. + sort(Versions, [&](auto *LHS, auto *RHS) { + return PriorityMask[LHS].ugt(PriorityMask[RHS]); }); - // Find the callsites and cache the feature mask for each caller. - SmallVector<Function *> Callers; + IFuncs.push_back(&IF); + VersionedFuncs.try_emplace(&IF, std::move(Versions)); + } + + for (GlobalIFunc *CalleeIF : IFuncs) { + SmallVector<Function *> NonFMVCallers; + DenseSet<GlobalIFunc *> CallerIFuncs; DenseMap<Function *, SmallVector<CallBase *>> CallSites; - for (User *U : IF.users()) { + + // Find the callsites. 
+ for (User *U : CalleeIF->users()) { if (auto *CB = dyn_cast<CallBase>(U)) { - if (CB->getCalledOperand() == &IF) { + if (CB->getCalledOperand() == CalleeIF) { Function *Caller = CB->getFunction(); - auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(Caller); - if (FeatInserted) - FeatIt->second = TTI.getFeatureMask(*Caller); - auto [CallIt, CallInserted] = CallSites.try_emplace(Caller); - if (CallInserted) - Callers.push_back(Caller); - CallIt->second.push_back(CB); + GlobalIFunc *CallerIF = nullptr; + TargetTransformInfo &TTI = GetTTI(*Caller); + bool CallerIsFMV = TTI.isMultiversionedFunction(*Caller); + // The caller is a version of a known IFunc. + if (auto It = VersionOf.find(Caller); It != VersionOf.end()) + CallerIF = It->second; + else if (!CallerIsFMV && OptimizeNonFMVCallers) { + // The caller is non-FMV. + auto [It, Inserted] = FeatureMask.try_emplace(Caller); + if (Inserted) + It->second = TTI.getFeatureMask(*Caller); + } else + // The caller is none of the above, skip. + continue; + auto [It, Inserted] = CallSites.try_emplace(Caller); + if (Inserted) { + if (CallerIsFMV) + CallerIFuncs.insert(CallerIF); + else + NonFMVCallers.push_back(Caller); + } + It->second.push_back(CB); } } } - // Sort the caller versions in decreasing priority order. - sort(Callers, [&](auto *LHS, auto *RHS) { - return FeatureMask[LHS].ugt(FeatureMask[RHS]); - }); + if (CallSites.empty()) + continue; - auto implies = [](APInt A, APInt B) { return B.isSubsetOf(A); }; - - // Index to the highest priority candidate. - unsigned I = 0; - // Now try to redirect calls starting from higher priority callers. - for (Function *Caller : Callers) { - assert(I < Callees.size() && "Found callers of equal priority"); - - Function *Callee = Callees[I]; - APInt CallerBits = FeatureMask[Caller]; - APInt CalleeBits = FeatureMask[Callee]; - - // In the case of FMV callers, we know that all higher priority callers - // than the current one did not get selected at runtime, which helps - // reason about the callees (if they have versions that mandate presence - // of the features which we already know are unavailable on this target). - if (TTI.isMultiversionedFunction(*Caller)) { - // If the feature set of the caller implies the feature set of the - // highest priority candidate then it shall be picked. In case of - // identical sets advance the candidate index one position. - if (CallerBits == CalleeBits) - ++I; - else if (!implies(CallerBits, CalleeBits)) { - // Keep advancing the candidate index as long as the caller's - // features are a subset of the current candidate's. - while (implies(CalleeBits, CallerBits)) { - if (++I == Callees.size()) - break; - CalleeBits = FeatureMask[Callees[I]]; + LLVM_DEBUG(dbgs() << "Statically resolving calls to function " + << CalleeIF->getResolverFunction()->getName() << "\n"); + + // The complexity of this algorithm is linear: O(NumCallers + NumCallees) + // if NumCallers > MaxIFuncVersions || NumCallees > MaxIFuncVersions, + // otherwise it is cubic: O((NumCallers ^ 2) x NumCallees). + auto staticallyResolveCalls = [&](ArrayRef<Function *> Callers, + ArrayRef<Function *> Callees, + bool CallerIsFMV) { + bool AllowExpensiveChecks = CallerIsFMV && + Callers.size() <= MaxIFuncVersions && + Callees.size() <= MaxIFuncVersions; + // Index to the highest callee candidate. + unsigned J = 0; + + for (unsigned I = 0, E = Callers.size(); I < E; ++I) { + // There are no callee candidates left. 
+ if (J == Callees.size()) + break; + + Function *Caller = Callers[I]; + APInt CallerBits = FeatureMask[Caller]; + + // Compare the feature bits of the best callee candidate with all the + // caller versions preceeding the current one. For each prior caller + // discard feature bits that are known to be available in the current + // caller. As long as the known missing feature bits are a subset of the + // callee feature bits, advance to the next callee and start over. + auto eliminateAvailableFeatures = [&](unsigned BestCandidate) { + unsigned K = 0; + while (K < I && BestCandidate < Callees.size()) { + APInt MissingBits = FeatureMask[Callers[K]] & ~CallerBits; + if (MissingBits.isSubsetOf(FeatureMask[Callees[BestCandidate]])) { + ++BestCandidate; + // Start over. + K = 0; + } else + ++K; } + return BestCandidate; + }; + + unsigned BestCandidate = + AllowExpensiveChecks ? eliminateAvailableFeatures(J) : J; + // No callee candidate was found for this caller. + if (BestCandidate == Callees.size()) continue; + + LLVM_DEBUG(dbgs() << " Examining " + << (CallerIsFMV ? "FMV" : "regular") << " caller " + << Caller->getName() << "\n"); + + Function *Callee = Callees[BestCandidate]; + APInt CalleeBits = FeatureMask[Callee]; + + // Statically resolve calls from the current caller to the current + // callee, iff the caller feature bits are a superset of the callee + // feature bits. + if (CalleeBits.isSubsetOf(CallerBits)) { + // Not all caller versions are necessarily users of the callee IFUNC. + if (auto It = CallSites.find(Caller); It != CallSites.end()) { + for (CallBase *CS : It->second) { + LLVM_DEBUG(dbgs() << " Redirecting call " << Caller->getName() + << " -> " << Callee->getName() << "\n"); + CS->setCalledOperand(Callee); + } + Changed = true; + } } - } else { - // We can't reason much about non-FMV callers. Just pick the highest - // priority callee if it matches, otherwise bail. - if (!OptimizeNonFMVCallers || I > 0 || !implies(CallerBits, CalleeBits)) + + // Nothing else to do about non-FMV callers. + if (!CallerIsFMV) continue; + + // For FMV callers, as long as the caller feature bits are a subset of + // the callee feature bits, advance to the next callee. This effectively + // prevents considering the current callee as a candidate for static + // resolution by following callers. + while (CallerBits.isSubsetOf(FeatureMask[Callees[J]]) && + ++J < Callees.size()) + ; } - auto &Calls = CallSites[Caller]; - for (CallBase *CS : Calls) { - LLVM_DEBUG(dbgs() << "Redirecting call " << Caller->getName() << " -> " - << Callee->getName() << "\n"); - CS->setCalledOperand(Callee); - } - Changed = true; + }; + + auto &Callees = VersionedFuncs[CalleeIF]; + + // Optimize non-FMV calls. + if (OptimizeNonFMVCallers) + staticallyResolveCalls(NonFMVCallers, Callees, /*CallerIsFMV=*/false); + + // Optimize FMV calls. 
+ for (GlobalIFunc *CallerIF : CallerIFuncs) { + auto &Callers = VersionedFuncs[CallerIF]; + staticallyResolveCalls(Callers, Callees, /*CallerIsFMV=*/true); } - if (IF.use_empty() || - all_of(IF.users(), [](User *U) { return isa<GlobalAlias>(U); })) + + if (CalleeIF->use_empty() || + all_of(CalleeIF->users(), [](User *U) { return isa<GlobalAlias>(U); })) NumIFuncsResolved++; } return Changed; diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index fdf0c3a..6e1ca9c 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -2108,8 +2108,7 @@ static void createAndInsertBasicBlocks(DenseMap<Value *, BasicBlock *> &OldMap, for (Value *RetVal : SortedKeys) { BasicBlock *NewBB = BasicBlock::Create( - ParentFunc->getContext(), - Twine(BaseName) + Twine("_") + Twine(static_cast<unsigned>(Idx++)), + ParentFunc->getContext(), Twine(BaseName) + Twine("_") + Twine(Idx++), ParentFunc); NewMap.insert(std::make_pair(RetVal, NewBB)); } @@ -2286,9 +2285,9 @@ void IROutliner::deduplicateExtractedSections( // Create a set of BasicBlocks, one for each return block, to hold the // needed store instructions. DenseMap<Value *, BasicBlock *> NewBBs; - createAndInsertBasicBlocks( - CurrentGroup.EndBBs, NewBBs, CurrentGroup.OutlinedFunction, - "output_block_" + Twine(static_cast<unsigned>(Idx))); + createAndInsertBasicBlocks(CurrentGroup.EndBBs, NewBBs, + CurrentGroup.OutlinedFunction, + "output_block_" + Twine(Idx)); replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings); alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBBs, CurrentGroup.EndBBs, OutputMappings, diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index aa1346d..06deea8 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -48,12 +48,14 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/IR/ModuleSummaryIndexYAML.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/ReplaceConstant.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" @@ -78,7 +80,6 @@ #include <algorithm> #include <cassert> #include <cstdint> -#include <memory> #include <set> #include <string> #include <system_error> @@ -803,7 +804,9 @@ Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI, return createBitSetTest(ThenB, TIL, BitOffset); } - IRBuilder<> ThenB(SplitBlockAndInsertIfThen(OffsetInRange, CI, false)); + MDBuilder MDB(M.getContext()); + IRBuilder<> ThenB(SplitBlockAndInsertIfThen(OffsetInRange, CI, false, + MDB.createLikelyBranchWeights())); // Now that we know that the offset is in range and aligned, load the // appropriate bit from the bitset. @@ -1470,6 +1473,9 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( Constant::getNullValue(F->getType())); Value *Select = Builder.CreateSelect(ICmp, JT, Constant::getNullValue(F->getType())); + + if (auto *SI = dyn_cast<SelectInst>(Select)) + setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE); // For phi nodes, we need to update the incoming value for all operands // with the same predecessor. if (PN) @@ -1552,12 +1558,7 @@ void LowerTypeTestsModule::createJumpTable( // Align the whole table by entry size. 
F->setAlignment(Align(getJumpTableEntrySize(JumpTableArch))); - // Skip prologue. - // Disabled on win32 due to https://llvm.org/bugs/show_bug.cgi?id=28641#c3. - // Luckily, this function does not get any prologue even without the - // attribute. - if (OS != Triple::Win32) - F->addFnAttr(Attribute::Naked); + F->addFnAttr(Attribute::Naked); if (JumpTableArch == Triple::arm) F->addFnAttr("target-features", "-thumb-mode"); if (JumpTableArch == Triple::thumb) { diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 894d83f..0f4bc64 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -107,6 +107,10 @@ STATISTIC(MismatchedCloneAssignments, STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes"); STATISTIC(TotalMergeIters, "Number of merge iterations for nodes"); STATISTIC(MaxMergeIters, "Max merge iterations for nodes"); +STATISTIC(NumImportantContextIds, "Number of important context ids"); +STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted"); +STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added"); +STATISTIC(NumFixedContexts, "Number of contexts with fixed edges"); static cl::opt<std::string> DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, @@ -223,9 +227,18 @@ static cl::opt<bool> MemProfRequireDefinitionForPromotion( extern cl::opt<bool> MemProfReportHintedSizes; extern cl::opt<unsigned> MinClonedColdBytePercent; +cl::opt<unsigned> MemProfTopNImportant( + "memprof-top-n-important", cl::init(10), cl::Hidden, + cl::desc("Number of largest cold contexts to consider important")); + +cl::opt<bool> MemProfFixupImportant( + "memprof-fixup-important", cl::init(true), cl::Hidden, + cl::desc("Enables edge fixup for important contexts")); + } // namespace llvm namespace { + /// CRTP base for graphs built from either IR or ThinLTO summary index. /// /// The graph represents the call contexts in all memprof metadata on allocation @@ -581,17 +594,26 @@ protected: /// Adds nodes for the given MIB stack ids. template <class NodeT, class IteratorT> - void addStackNodesForMIB(ContextNode *AllocNode, - CallStack<NodeT, IteratorT> &StackContext, - CallStack<NodeT, IteratorT> &CallsiteContext, - AllocationType AllocType, - ArrayRef<ContextTotalSize> ContextSizeInfo); + void addStackNodesForMIB( + ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext, + CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType, + ArrayRef<ContextTotalSize> ContextSizeInfo, + std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold); /// Matches all callsite metadata (or summary) to the nodes created for /// allocation memprof MIB metadata, synthesizing new nodes to reflect any /// inlining performed on those callsite instructions. void updateStackNodes(); + /// Optionally fixup edges for the N largest cold contexts to better enable + /// cloning. This is particularly helpful if the context includes recursion + /// as well as inlining, resulting in a single stack node for multiple stack + /// ids in the context. With recursion it is particularly difficult to get the + /// edge updates correct as in the general case we have lost the original + /// stack id ordering for the context. Do more expensive fixup for the largest + /// contexts, controlled by MemProfTopNImportant and MemProfFixupImportant. 
+ void fixupImportantContexts(); + /// Update graph to conservatively handle any callsite stack nodes that target /// multiple different callee target functions. void handleCallsitesWithMultipleTargets(); @@ -658,7 +680,8 @@ private: void assignStackNodesPostOrder( ContextNode *Node, DenseSet<const ContextNode *> &Visited, DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls, - DenseMap<CallInfo, CallInfo> &CallToMatchingCall); + DenseMap<CallInfo, CallInfo> &CallToMatchingCall, + const DenseSet<uint32_t> &ImportantContextIds); /// Duplicates the given set of context ids, updating the provided /// map from each original id with the newly generated context ids, @@ -859,6 +882,50 @@ private: /// nodes. DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap; + /// Saves information for the contexts identified as important (the largest + /// cold contexts up to MemProfTopNImportant). + struct ImportantContextInfo { + // The original list of leaf first stack ids corresponding to this context. + std::vector<uint64_t> StackIds; + // Max length of stack ids corresponding to a single stack ContextNode for + // this context (i.e. the max length of a key in StackIdsToNode below). + unsigned MaxLength = 0; + // Mapping of slices of the stack ids to the corresponding ContextNode + // (there can be multiple stack ids due to inlining). Populated when + // updating stack nodes while matching them to the IR or summary. + std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode; + }; + + // Map of important full context ids to information about each. + DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo; + + // For each important context id found in Node (if any), records the list of + // stack ids that corresponded to the given callsite Node. There can be more + // than one in the case of inlining. + void recordStackNode(std::vector<uint64_t> &StackIds, ContextNode *Node, + // We pass in the Node's context ids to avoid the + // overhead of computing them as the caller already has + // them in some cases. + const DenseSet<uint32_t> &NodeContextIds, + const DenseSet<uint32_t> &ImportantContextIds) { + if (!MemProfTopNImportant) { + assert(ImportantContextIds.empty()); + return; + } + DenseSet<uint32_t> Ids = + set_intersection(NodeContextIds, ImportantContextIds); + if (Ids.empty()) + return; + auto Size = StackIds.size(); + for (auto Id : Ids) { + auto &Entry = ImportantContextIdInfo[Id]; + Entry.StackIdsToNode[StackIds] = Node; + // Keep track of the max to simplify later analysis. + if (Size > Entry.MaxLength) + Entry.MaxLength = Size; + } + } + /// Maps to track the calls to their corresponding nodes in the graph. 
MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap; MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap; @@ -1034,11 +1101,11 @@ private: } // namespace template <> -struct llvm::DenseMapInfo<typename CallsiteContextGraph< +struct llvm::DenseMapInfo<CallsiteContextGraph< ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo> : public DenseMapInfo<std::pair<Instruction *, unsigned>> {}; template <> -struct llvm::DenseMapInfo<typename CallsiteContextGraph< +struct llvm::DenseMapInfo<CallsiteContextGraph< IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo> : public DenseMapInfo<std::pair<IndexCall, unsigned>> {}; template <> @@ -1353,7 +1420,8 @@ template <class NodeT, class IteratorT> void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext, CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType, - ArrayRef<ContextTotalSize> ContextSizeInfo) { + ArrayRef<ContextTotalSize> ContextSizeInfo, + std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold) { // Treating the hot alloc type as NotCold before the disambiguation for "hot" // is done. if (AllocType == AllocationType::Hot) @@ -1361,8 +1429,33 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( ContextIdToAllocationType[++LastContextId] = AllocType; + bool IsImportant = false; if (!ContextSizeInfo.empty()) { auto &Entry = ContextIdToContextSizeInfos[LastContextId]; + // If this is a cold allocation, and we are collecting non-zero largest + // contexts, see if this is a candidate. + if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) { + uint64_t TotalCold = 0; + for (auto &CSI : ContextSizeInfo) + TotalCold += CSI.TotalSize; + // Record this context if either we haven't found the first top-n largest + // yet, or if it is larger than the smallest already recorded. + if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant || + // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly + // sorted in ascending size of its key which is the size. + TotalCold > TotalSizeToContextIdTopNCold.begin()->first) { + if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) { + // Remove old one and its associated entries. 
+ auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second; + TotalSizeToContextIdTopNCold.erase( + TotalSizeToContextIdTopNCold.begin()); + assert(ImportantContextIdInfo.count(IdToRemove)); + ImportantContextIdInfo.erase(IdToRemove); + } + TotalSizeToContextIdTopNCold[TotalCold] = LastContextId; + IsImportant = true; + } + } Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end()); } @@ -1381,6 +1474,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext); ContextIter != StackContext.end(); ++ContextIter) { auto StackId = getStackId(*ContextIter); + if (IsImportant) + ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId); ContextNode *StackNode = getNodeForStackId(StackId); if (!StackNode) { StackNode = createNewNode(/*IsAllocation=*/false); @@ -1600,11 +1695,12 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, template <typename DerivedCCG, typename FuncTy, typename CallTy> void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: - assignStackNodesPostOrder( - ContextNode *Node, DenseSet<const ContextNode *> &Visited, - DenseMap<uint64_t, std::vector<CallContextInfo>> - &StackIdToMatchingCalls, - DenseMap<CallInfo, CallInfo> &CallToMatchingCall) { + assignStackNodesPostOrder(ContextNode *Node, + DenseSet<const ContextNode *> &Visited, + DenseMap<uint64_t, std::vector<CallContextInfo>> + &StackIdToMatchingCalls, + DenseMap<CallInfo, CallInfo> &CallToMatchingCall, + const DenseSet<uint32_t> &ImportantContextIds) { auto Inserted = Visited.insert(Node); if (!Inserted.second) return; @@ -1620,7 +1716,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: continue; } assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls, - CallToMatchingCall); + CallToMatchingCall, ImportantContextIds); } // If this node's stack id is in the map, update the graph to contain new @@ -1648,6 +1744,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: Node->setCall(Call); NonAllocationCallToContextNodeMap[Call] = Node; NodeToCallingFunc[Node] = Func; + recordStackNode(Ids, Node, Node->getContextIds(), ImportantContextIds); return; } } @@ -1786,6 +1883,9 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: : CurNode->computeAllocType(); PrevNode = CurNode; } + + recordStackNode(Ids, NewNode, SavedContextIds, ImportantContextIds); + if (VerifyNodes) { checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true); for (auto Id : Ids) { @@ -1799,6 +1899,122 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: } template <typename DerivedCCG, typename FuncTy, typename CallTy> +void CallsiteContextGraph<DerivedCCG, FuncTy, + CallTy>::fixupImportantContexts() { + if (ImportantContextIdInfo.empty()) + return; + + // Update statistics as we are done building this map at this point. + NumImportantContextIds = ImportantContextIdInfo.size(); + + if (!MemProfFixupImportant) + return; + + if (ExportToDot) + exportToDot("beforestackfixup"); + + // For each context we identified as important, walk through the saved context + // stack ids in order from leaf upwards, and make sure all edges are correct. + // These can be difficult to get right when updating the graph while mapping + // nodes onto summary or IR, especially when there is recursion. 
In + // particular, when we have created new nodes to reflect inlining, it is + // sometimes impossible to know exactly how to update the edges in the face of + // recursion, as we have lost the original ordering of the stack ids in the + // contexts. + // TODO: Consider only doing this if we detect the context has recursive + // cycles. + // + // I.e. assume we have a context with stack ids like: {A B A C A D E} + // and let's say A was inlined into B, C, and D. The original graph will have + // multiple recursive cycles through A. When we match the original context + // nodes onto the IR or summary, we will merge {A B} into one context node, + // {A C} onto another, and {A D} onto another. Looking at the stack sequence + // above, we should end up with a non-cyclic set of edges like: + // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the + // original ordering, we won't get the edges correct initially (it's + // impossible without the original ordering). Here we do the fixup (add and + // removing edges where necessary) for this context. In the + // ImportantContextInfo struct in this case we should have a MaxLength = 2, + // and map entries for {A B}, {A C}, {A D}, and {E}. + for (auto &[CurContextId, Info] : ImportantContextIdInfo) { + if (Info.StackIdsToNode.empty()) + continue; + bool Changed = false; + ContextNode *PrevNode = nullptr; + ContextNode *CurNode = nullptr; + DenseSet<const ContextEdge *> VisitedEdges; + ArrayRef<uint64_t> AllStackIds(Info.StackIds); + // Try to identify what callsite ContextNode maps to which slice of the + // context's ordered stack ids. + for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) { + // We will do this greedily, trying up to MaxLength stack ids in a row, to + // see if we recorded a context node for that sequence. + auto Len = Info.MaxLength; + auto LenToEnd = AllStackIds.size() - I; + if (Len > LenToEnd) + Len = LenToEnd; + CurNode = nullptr; + // Try to find a recorded context node starting with the longest length + // recorded, and on down until we check for just a single stack node. + for (; Len > 0; Len--) { + // Get the slice of the original stack id sequence to check. + auto CheckStackIds = AllStackIds.slice(I, Len); + auto EntryIt = Info.StackIdsToNode.find(CheckStackIds); + if (EntryIt == Info.StackIdsToNode.end()) + continue; + CurNode = EntryIt->second; + // Skip forward so we don't try to look for the ones we just matched. + // We increment by Len - 1, because the outer for loop will increment I. + I += Len - 1; + break; + } + // Give up if we couldn't find a node. Since we need to clone from the + // leaf allocation upwards, no sense in doing anymore fixup further up + // the context if we couldn't match part of the original stack context + // onto a callsite node. + if (!CurNode) + break; + // No edges to fix up until we have a pair of nodes that should be + // adjacent in the graph. + if (!PrevNode) + continue; + // See if we already have a call edge from CurNode to PrevNode. + auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode); + if (CurEdge) { + // We already have an edge. Make sure it contains this context id. + if (CurEdge->getContextIds().insert(CurContextId).second) { + NumFixupEdgeIdsInserted++; + Changed = true; + } + } else { + // No edge exists - add one. 
+ NumFixupEdgesAdded++; + DenseSet<uint32_t> ContextIds({CurContextId}); + auto AllocType = computeAllocType(ContextIds); + auto NewEdge = std::make_shared<ContextEdge>( + PrevNode, CurNode, AllocType, std::move(ContextIds)); + PrevNode->CallerEdges.push_back(NewEdge); + CurNode->CalleeEdges.push_back(NewEdge); + // Save the new edge for the below handling. + CurEdge = NewEdge.get(); + Changed = true; + } + VisitedEdges.insert(CurEdge); + // Now remove this context id from any other caller edges calling + // PrevNode. + for (auto &Edge : PrevNode->CallerEdges) { + // Skip the edge updating/created above and edges we have already + // visited (due to recursion). + if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get())) + Edge->getContextIds().erase(CurContextId); + } + } + if (Changed) + NumFixedContexts++; + } +} + +template <typename DerivedCCG, typename FuncTy, typename CallTy> void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { // Map of stack id to all calls with that as the last (outermost caller) // callsite id that has a context node (some might not due to pruning @@ -2043,9 +2259,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { // nodes representing any inlining at interior callsites. Note we move the // associated context ids over to the new nodes. DenseSet<const ContextNode *> Visited; + DenseSet<uint32_t> ImportantContextIds(llvm::from_range, + ImportantContextIdInfo.keys()); for (auto &Entry : AllocationCallToContextNodeMap) assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls, - CallToMatchingCall); + CallToMatchingCall, ImportantContextIds); + + fixupImportantContexts(); + if (VerifyCCG) check(); } @@ -2155,6 +2376,10 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( Module &M, llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) : Mod(M), OREGetter(OREGetter) { + // Map for keeping track of the largest cold contexts up to the number given + // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys + // must be sorted. + std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold; for (auto &F : M) { std::vector<CallInfo> CallsWithMetadata; for (auto &BB : F) { @@ -2191,7 +2416,8 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode); addStackNodesForMIB<MDNode, MDNode::op_iterator>( AllocNode, StackContext, CallsiteContext, - getMIBAllocType(MIBMD), ContextSizeInfo); + getMIBAllocType(MIBMD), ContextSizeInfo, + TotalSizeToContextIdTopNCold); } // If exporting the graph to dot and an allocation id of interest was // specified, record all the context ids for this allocation node. @@ -2241,6 +2467,10 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> isPrevailing) : Index(Index), isPrevailing(isPrevailing) { + // Map for keeping track of the largest cold contexts up to the number given + // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys + // must be sorted. 
+ std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold; for (auto &I : Index) { auto VI = Index.getValueInfo(I); for (auto &S : VI.getSummaryList()) { @@ -2288,7 +2518,7 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( } addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>( AllocNode, StackContext, EmptyContext, MIB.AllocType, - ContextSizeInfo); + ContextSizeInfo, TotalSizeToContextIdTopNCold); I++; } // If exporting the graph to dot and an allocation id of interest was diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index d7eb745..1f52f9a 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -208,7 +208,7 @@ namespace KernelInfo { // }; #define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \ - constexpr const unsigned MEMBER##Idx = IDX; + constexpr unsigned MEMBER##Idx = IDX; KERNEL_ENVIRONMENT_IDX(Configuration, 0) KERNEL_ENVIRONMENT_IDX(Ident, 1) @@ -216,7 +216,7 @@ KERNEL_ENVIRONMENT_IDX(Ident, 1) #undef KERNEL_ENVIRONMENT_IDX #define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \ - constexpr const unsigned MEMBER##Idx = IDX; + constexpr unsigned MEMBER##Idx = IDX; KERNEL_ENVIRONMENT_CONFIGURATION_IDX(UseGenericStateMachine, 0) KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MayUseNestedParallelism, 1) @@ -258,7 +258,7 @@ KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxTeams) GlobalVariable * getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB) { - constexpr const int InitKernelEnvironmentArgNo = 0; + constexpr int InitKernelEnvironmentArgNo = 0; return cast<GlobalVariable>( KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo) ->stripPointerCasts()); @@ -730,7 +730,7 @@ struct KernelInfoState : AbstractState { /// State to indicate if we can track parallel level of the associated /// function. We will give up tracking if we encounter unknown caller or the - /// caller is __kmpc_parallel_51. + /// caller is __kmpc_parallel_60. BooleanStateWithSetVector<uint8_t> ParallelLevels; /// Flag that indicates if the kernel has nested Parallelism @@ -2117,8 +2117,8 @@ Kernel OpenMPOpt::getUniqueKernelFor(Function &F) { return getUniqueKernelFor(*CB); OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI = - OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51]; - // Allow the use in __kmpc_parallel_51 calls. + OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60]; + // Allow the use in __kmpc_parallel_60 calls. if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI)) return getUniqueKernelFor(*CB); return nullptr; @@ -2145,7 +2145,7 @@ Kernel OpenMPOpt::getUniqueKernelFor(Function &F) { bool OpenMPOpt::rewriteDeviceCodeStateMachine() { OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI = - OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51]; + OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60]; bool Changed = false; if (!KernelParallelRFI) @@ -2157,7 +2157,7 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() { for (Function *F : SCC) { - // Check if the function is a use in a __kmpc_parallel_51 call at + // Check if the function is a use in a __kmpc_parallel_60 call at // all. bool UnknownUse = false; bool KernelParallelUse = false; @@ -2189,7 +2189,7 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() { UnknownUse = true; }); - // Do not emit a remark if we haven't seen a __kmpc_parallel_51 + // Do not emit a remark if we haven't seen a __kmpc_parallel_60 // use. 
if (!KernelParallelUse) continue; @@ -2208,7 +2208,7 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() { continue; } - // Even if we have __kmpc_parallel_51 calls, we (for now) give + // Even if we have __kmpc_parallel_60 calls, we (for now) give // up if the function is not called from a unique kernel. Kernel K = getUniqueKernelFor(*F); if (!K) { @@ -4457,7 +4457,7 @@ struct AAKernelInfoFunction : AAKernelInfo { Instruction *IsWorker = ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB, - ConstantInt::get(KernelInitCB->getType(), -1), + ConstantInt::getAllOnesValue(KernelInitCB->getType()), "thread.is_worker", InitBB); IsWorker->setDebugLoc(DLoc); BranchInst::Create(IsWorkerCheckBB, UserCodeEntryBB, IsWorker, InitBB); @@ -4821,8 +4821,8 @@ private: /// Update info regarding parallel levels. void updateParallelLevels(Attributor &A) { auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); - OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI = - OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51]; + OMPInformationCache::RuntimeFunctionInfo &Parallel60RFI = + OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60]; auto PredCallSite = [&](AbstractCallSite ACS) { Function *Caller = ACS.getInstruction()->getFunction(); @@ -4832,12 +4832,12 @@ private: auto *CAA = A.getOrCreateAAFor<AAKernelInfo>(IRPosition::function(*Caller)); if (CAA && CAA->ParallelLevels.isValidState()) { - // Any function that is called by `__kmpc_parallel_51` will not be + // Any function that is called by `__kmpc_parallel_60` will not be // folded as the parallel level in the function is updated. In order to // get it right, all the analysis would depend on the implentation. That // said, if in the future any change to the implementation, the analysis // could be wrong. As a consequence, we are just conservative here. - if (Caller == Parallel51RFI.Declaration) { + if (Caller == Parallel60RFI.Declaration) { ParallelLevels.indicatePessimisticFixpoint(); return true; } @@ -5006,8 +5006,8 @@ struct AAKernelInfoCallSite : AAKernelInfo { case OMPRTL___kmpc_target_deinit: KernelDeinitCB = &CB; break; - case OMPRTL___kmpc_parallel_51: - if (!handleParallel51(A, CB)) + case OMPRTL___kmpc_parallel_60: + if (!handleParallel60(A, CB)) indicatePessimisticFixpoint(); return; case OMPRTL___kmpc_omp_task: @@ -5075,8 +5075,8 @@ struct AAKernelInfoCallSite : AAKernelInfo { return indicatePessimisticFixpoint(); CallBase &CB = cast<CallBase>(getAssociatedValue()); - if (It->getSecond() == OMPRTL___kmpc_parallel_51) { - if (!handleParallel51(A, CB)) + if (It->getSecond() == OMPRTL___kmpc_parallel_60) { + if (!handleParallel60(A, CB)) return indicatePessimisticFixpoint(); return StateBefore == getState() ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED; @@ -5136,9 +5136,9 @@ struct AAKernelInfoCallSite : AAKernelInfo { : ChangeStatus::CHANGED; } - /// Deal with a __kmpc_parallel_51 call (\p CB). Returns true if the call was + /// Deal with a __kmpc_parallel_60 call (\p CB). Returns true if the call was /// handled, if a problem occurred, false is returned. 
- bool handleParallel51(Attributor &A, CallBase &CB) { + bool handleParallel60(Attributor &A, CallBase &CB) { const unsigned int NonWrapperFunctionArgNo = 5; const unsigned int WrapperFunctionArgNo = 6; auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed() diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index e39e311..8e76b79 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -83,7 +83,6 @@ #include <cstdint> #include <functional> #include <limits> -#include <map> #include <memory> #include <queue> #include <string> @@ -2293,7 +2292,6 @@ bool SampleProfileLoader::runOnFunction(Function &F, // count value. if (!F.getEntryCount()) F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real)); - std::unique_ptr<OptimizationRemarkEmitter> OwnedORE; auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent()) .getManager(); ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp index 70b8614..b9fb7a3 100644 --- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp @@ -18,6 +18,8 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/LongestCommonSequence.h" +#include <unordered_set> + using namespace llvm; using namespace sampleprof; diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 2dd0fde..7aa90ee 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -99,6 +99,7 @@ #include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/DebugCounter.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" @@ -130,6 +131,8 @@ STATISTIC(NumUniqueRetVal, "Number of unique return value optimizations"); STATISTIC(NumVirtConstProp1Bit, "Number of 1 bit virtual constant propagations"); STATISTIC(NumVirtConstProp, "Number of virtual constant propagations"); +DEBUG_COUNTER(CallsToDevirt, "calls-to-devirt", + "Controls how many calls should be devirtualized."); namespace llvm { @@ -219,14 +222,6 @@ static cl::opt<bool> WholeProgramDevirtKeepUnreachableFunction( cl::desc("Regard unreachable functions as possible devirtualize targets."), cl::Hidden, cl::init(true)); -/// If explicitly specified, the devirt module pass will stop transformation -/// once the total number of devirtualizations reach the cutoff value. Setting -/// this option to 0 explicitly will do 0 devirtualization. -static cl::opt<unsigned> WholeProgramDevirtCutoff( - "wholeprogramdevirt-cutoff", - cl::desc("Max number of devirtualizations for devirt module pass"), - cl::init(0)); - /// Mechanism to add runtime checking of devirtualization decisions, optionally /// trapping or falling back to indirect call on any that are not correct. /// Trapping mode is useful for debugging undefined behavior leading to failures @@ -377,9 +372,6 @@ VirtualCallTarget::VirtualCallTarget(GlobalValue *Fn, const TypeMemberInfo *TM) namespace { -// Tracks the number of devirted calls in the IR transformation. -static unsigned NumDevirtCalls = 0; - // A slot in a set of virtual tables. 
The TypeID identifies the set of virtual // tables, and the ByteOffset is the offset in bytes from the address point to // the virtual function pointer. @@ -636,9 +628,11 @@ struct DevirtModule { std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest; PatternList FunctionsToSkip; + const bool DevirtSpeculatively; DevirtModule(Module &M, ModuleAnalysisManager &MAM, ModuleSummaryIndex *ExportSummary, - const ModuleSummaryIndex *ImportSummary) + const ModuleSummaryIndex *ImportSummary, + bool DevirtSpeculatively) : M(M), MAM(MAM), FAM(MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()), ExportSummary(ExportSummary), ImportSummary(ImportSummary), @@ -651,7 +645,8 @@ struct DevirtModule { RemarksEnabled(areRemarksEnabled()), OREGetter([&](Function &F) -> OptimizationRemarkEmitter & { return FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); - }) { + }), + DevirtSpeculatively(DevirtSpeculatively) { assert(!(ExportSummary && ImportSummary)); FunctionsToSkip.init(SkipFunctionNames); } @@ -765,7 +760,8 @@ struct DevirtModule { // Lower the module using the action and summary passed as command line // arguments. For testing purposes only. - static bool runForTesting(Module &M, ModuleAnalysisManager &MAM); + static bool runForTesting(Module &M, ModuleAnalysisManager &MAM, + bool DevirtSpeculatively); }; struct DevirtIndex { @@ -808,11 +804,22 @@ struct DevirtIndex { PreservedAnalyses WholeProgramDevirtPass::run(Module &M, ModuleAnalysisManager &MAM) { if (UseCommandLine) { - if (!DevirtModule::runForTesting(M, MAM)) + if (!DevirtModule::runForTesting(M, MAM, ClDevirtualizeSpeculatively)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } - if (!DevirtModule(M, MAM, ExportSummary, ImportSummary).run()) + + std::optional<ModuleSummaryIndex> Index; + if (!ExportSummary && !ImportSummary && DevirtSpeculatively) { + // Build the ExportSummary from the module. + assert(!ExportSummary && + "ExportSummary is expected to be empty in non-LTO mode"); + ProfileSummaryInfo PSI(M); + Index.emplace(buildModuleSummaryIndex(M, nullptr, &PSI)); + ExportSummary = Index.has_value() ? &Index.value() : nullptr; + } + if (!DevirtModule(M, MAM, ExportSummary, ImportSummary, DevirtSpeculatively) + .run()) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } @@ -1010,7 +1017,8 @@ static Error checkCombinedSummaryForTesting(ModuleSummaryIndex *Summary) { return ErrorSuccess(); } -bool DevirtModule::runForTesting(Module &M, ModuleAnalysisManager &MAM) { +bool DevirtModule::runForTesting(Module &M, ModuleAnalysisManager &MAM, + bool DevirtSpeculatively) { std::unique_ptr<ModuleSummaryIndex> Summary = std::make_unique<ModuleSummaryIndex>(/*HaveGVs=*/false); @@ -1039,7 +1047,8 @@ bool DevirtModule::runForTesting(Module &M, ModuleAnalysisManager &MAM) { ClSummaryAction == PassSummaryAction::Export ? Summary.get() : nullptr, ClSummaryAction == PassSummaryAction::Import ? Summary.get() - : nullptr) + : nullptr, + DevirtSpeculatively) .run(); if (!ClWriteSummary.empty()) { @@ -1103,10 +1112,10 @@ bool DevirtModule::tryFindVirtualCallTargets( if (!TM.Bits->GV->isConstant()) return false; - // Without ClDevirtualizeSpeculatively, we cannot perform whole program + // Without DevirtSpeculatively, we cannot perform whole program // devirtualization analysis on a vtable with public LTO visibility. 
- if (!ClDevirtualizeSpeculatively && TM.Bits->GV->getVCallVisibility() == - GlobalObject::VCallVisibilityPublic) + if (!DevirtSpeculatively && TM.Bits->GV->getVCallVisibility() == + GlobalObject::VCallVisibilityPublic) return false; Function *Fn = nullptr; @@ -1127,7 +1136,7 @@ bool DevirtModule::tryFindVirtualCallTargets( // In most cases empty functions will be overridden by the // implementation of the derived class, so we can skip them. - if (ClDevirtualizeSpeculatively && Fn->getReturnType()->isVoidTy() && + if (DevirtSpeculatively && Fn->getReturnType()->isVoidTy() && Fn->getInstructionCount() <= 1) continue; @@ -1216,15 +1225,13 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, continue; // Stop when the number of devirted calls reaches the cutoff. - if (WholeProgramDevirtCutoff.getNumOccurrences() > 0 && - NumDevirtCalls >= WholeProgramDevirtCutoff) - return; + if (!DebugCounter::shouldExecute(CallsToDevirt)) + continue; if (RemarksEnabled) VCallSite.emitRemark("single-impl", TheFn->stripPointerCasts()->getName(), OREGetter); NumSingleImpl++; - NumDevirtCalls++; auto &CB = VCallSite.CB; assert(!CB.getCalledFunction() && "devirtualizing direct call?"); IRBuilder<> Builder(&CB); @@ -1250,8 +1257,7 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, // add support to compare the virtual function pointer to the // devirtualized target. In case of a mismatch, fall back to indirect // call. - if (DevirtCheckMode == WPDCheckMode::Fallback || - ClDevirtualizeSpeculatively) { + if (DevirtCheckMode == WPDCheckMode::Fallback || DevirtSpeculatively) { MDNode *Weights = MDBuilder(M.getContext()).createLikelyBranchWeights(); // Version the indirect call site. If the called value is equal to the // given callee, 'NewInst' will be executed, otherwise the original call @@ -2375,7 +2381,7 @@ bool DevirtModule::run() { Function *PublicTypeTestFunc = nullptr; // If we are in speculative devirtualization mode, we can work on the public // type test intrinsics. - if (ClDevirtualizeSpeculatively) + if (DevirtSpeculatively) PublicTypeTestFunc = Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test); Function *TypeTestFunc = @@ -2511,7 +2517,7 @@ bool DevirtModule::run() { // Out of speculative devirtualization mode, Try to apply virtual constant // propagation or branch funneling. // TODO: This should eventually be enabled for non-public type tests. - if (!SingleImplDevirt && !ClDevirtualizeSpeculatively) { + if (!SingleImplDevirt && !DevirtSpeculatively) { DidVirtualConstProp |= tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first); |

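The GlobalOpt hunks rework static resolution of calls through FMV ifuncs: versions are sorted by decreasing priority, a call is bound directly only when the callee's feature bits are a subset of the caller's, and for FMV callers the candidate index is advanced past callees whose features strictly exceed the caller's (those callees could not have been rejected at runtime if this caller was selected). The sketch below models that core loop with plain 64-bit masks; it deliberately omits the more expensive multi-caller refinement gated by `max-ifunc-versions`, and the types `Version`, `resolveCalls`, `isSubsetOf`, and the feature-bit labels are invented for illustration.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Illustrative model of the feature-bit reasoning in OptimizeNonTrivialIFuncs.
// Both lists are assumed to be pre-sorted by decreasing priority.
struct Version {
  std::string Name;
  uint64_t Features; // one bit per target feature
};

static bool isSubsetOf(uint64_t A, uint64_t B) { return (A & ~B) == 0; }

// For each caller, returns the callee it can be statically bound to, or an
// empty string if the call must keep going through the ifunc resolver.
std::vector<std::string> resolveCalls(const std::vector<Version> &Callers,
                                      const std::vector<Version> &Callees,
                                      bool CallersAreFMV) {
  std::vector<std::string> Result(Callers.size());
  size_t J = 0; // highest-priority callee candidate still available
  for (size_t I = 0; I < Callers.size() && J < Callees.size(); ++I) {
    uint64_t CallerBits = Callers[I].Features;
    uint64_t CalleeBits = Callees[J].Features;
    // A direct call is legal iff the callee needs no feature the caller lacks.
    if (isSubsetOf(CalleeBits, CallerBits))
      Result[I] = Callees[J].Name;
    if (!CallersAreFMV)
      continue;
    // If this FMV caller was selected at runtime, any callee whose features
    // are a superset of the caller's is off the table for lower-priority
    // callers, so advance past those candidates.
    while (J < Callees.size() && isSubsetOf(CallerBits, Callees[J].Features))
      ++J;
  }
  return Result;
}

int main() {
  // Bit 0 and bit 1 stand in for two target features (labels are arbitrary).
  std::vector<Version> Callees = {
      {"foo._Msve2", 0b11}, {"foo._Msve", 0b01}, {"foo.default", 0}};
  std::vector<Version> Callers = {
      {"bar._Msve2", 0b11}, {"bar._Msve", 0b01}, {"bar.default", 0}};
  for (auto &Name : resolveCalls(Callers, Callees, /*CallersAreFMV=*/true))
    std::cout << (Name.empty() ? "<via resolver>" : Name) << "\n";
}

Running this binds each caller version to the callee of matching priority (bar._Msve2 -> foo._Msve2, and so on), which is the behavior the patch's comment block describes for the linear, non-expensive path.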