Diffstat (limited to 'llvm/lib')
95 files changed, 2667 insertions, 1441 deletions
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 805b682..853bd66 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -122,11 +122,61 @@ static cl::opt<unsigned> MIVMaxLevelThreshold( cl::desc("Maximum depth allowed for the recursive algorithm used to " "explore MIV direction vectors.")); -static cl::opt<bool> RunSIVRoutinesOnly( - "da-run-siv-routines-only", cl::init(false), cl::ReallyHidden, - cl::desc("Run only SIV routines and disable others (ZIV, RDIV, and MIV). " - "The purpose is mainly to exclude the influence of those routines " - "in regression tests for SIV routines.")); +namespace { + +/// Types of dependence test routines. +enum class DependenceTestType { + All, + StrongSIV, + WeakCrossingSIV, + ExactSIV, + WeakZeroSIV, + ExactRDIV, + SymbolicRDIV, + GCDMIV, + BanerjeeMIV, +}; + +} // anonymous namespace + +static cl::opt<DependenceTestType> EnableDependenceTest( + "da-enable-dependence-test", cl::init(DependenceTestType::All), + cl::ReallyHidden, + cl::desc("Run only the specified dependence test routine and disable others. " + "The purpose is mainly to exclude the influence of other " + "dependence test routines in regression tests. If set to All, all " + "dependence test routines are enabled."), + cl::values(clEnumValN(DependenceTestType::All, "all", + "Enable all dependence test routines."), + clEnumValN(DependenceTestType::StrongSIV, "strong-siv", + "Enable only Strong SIV test."), + clEnumValN(DependenceTestType::WeakCrossingSIV, + "weak-crossing-siv", + "Enable only Weak-Crossing SIV test."), + clEnumValN(DependenceTestType::ExactSIV, "exact-siv", + "Enable only Exact SIV test."), + clEnumValN(DependenceTestType::WeakZeroSIV, "weak-zero-siv", + "Enable only Weak-Zero SIV test."), + clEnumValN(DependenceTestType::ExactRDIV, "exact-rdiv", + "Enable only Exact RDIV test."), + clEnumValN(DependenceTestType::SymbolicRDIV, "symbolic-rdiv", + "Enable only Symbolic RDIV test."), + clEnumValN(DependenceTestType::GCDMIV, "gcd-miv", + "Enable only GCD MIV test."), + clEnumValN(DependenceTestType::BanerjeeMIV, "banerjee-miv", + "Enable only Banerjee MIV test."))); + +// TODO: This flag is disabled by default because it is still under development. +// Enable it or delete this flag when the feature is ready. +static cl::opt<bool> EnableMonotonicityCheck( + "da-enable-monotonicity-check", cl::init(false), cl::Hidden, + cl::desc("Check if the subscripts are monotonic. If they are not, the " + "dependence is reported as unknown.")); + +static cl::opt<bool> DumpMonotonicityReport( + "da-dump-monotonicity-report", cl::init(false), cl::Hidden, + cl::desc( + "When printing analysis, dump the results of monotonicity checks.")); //===----------------------------------------------------------------------===// // basics @@ -177,13 +227,196 @@ void DependenceAnalysisWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredTransitive<LoopInfoWrapperPass>(); } +namespace { + +/// The property of monotonicity of a SCEV. To define monotonicity, assume +/// a SCEV defined within N nested loops. Let i_k denote the iteration number +/// of the k-th loop. Then we can regard the SCEV as an N-ary function: +/// +/// F(i_1, i_2, ..., i_N) +/// +/// The domain of i_k is the closed range [0, BTC_k], where BTC_k is the +/// backedge-taken count of the k-th loop. 
+/// +/// A function F is said to be "monotonically increasing with respect to the +/// k-th loop" if x <= y implies the following condition: +/// +/// F(i_1, ..., i_{k-1}, x, i_{k+1}, ..., i_N) <= +/// F(i_1, ..., i_{k-1}, y, i_{k+1}, ..., i_N) +/// +/// where i_1, ..., i_{k-1}, i_{k+1}, ..., i_N, x, and y are elements of their +/// respective domains. +/// +/// Likewise F is "monotonically decreasing with respect to the k-th loop" +/// if x <= y implies +/// +/// F(i_1, ..., i_{k-1}, x, i_{k+1}, ..., i_N) >= +/// F(i_1, ..., i_{k-1}, y, i_{k+1}, ..., i_N) +/// +/// A function F that is monotonically increasing or decreasing with respect to +/// the k-th loop is simply called "monotonic with respect to the k-th loop". +/// +/// A function F is said to be "multivariate monotonic" when it is monotonic +/// with respect to all of the N loops. +/// +/// Since integer comparison can be either signed or unsigned, we need to +/// distinguish monotonicity in the signed sense from that in the unsigned +/// sense. Note that the inequality "x <= y" merely indicates loop progression +/// and is not affected by the difference between signed and unsigned order. +/// +/// Currently we only consider monotonicity in a signed sense. enum class SCEVMonotonicityType { + /// We don't know anything about the monotonicity of the SCEV. + Unknown, + + /// The SCEV is loop-invariant with respect to the outermost loop. In other + /// words, the function F corresponding to the SCEV is a constant function. + Invariant, + + /// The function F corresponding to the SCEV is multivariate monotonic in a + /// signed sense. Note that a multivariate monotonic function may also be a + /// constant function, since the order employed in the definition of + /// monotonicity is not a strict order. + MultivariateSignedMonotonic, +}; + +struct SCEVMonotonicity { + SCEVMonotonicity(SCEVMonotonicityType Type, + const SCEV *FailurePoint = nullptr); + + SCEVMonotonicityType getType() const { return Type; } + + const SCEV *getFailurePoint() const { return FailurePoint; } + + bool isUnknown() const { return Type == SCEVMonotonicityType::Unknown; } + + void print(raw_ostream &OS, unsigned Depth) const; + +private: + SCEVMonotonicityType Type; + + /// The subexpression that caused Unknown. Mainly for debugging purposes. + const SCEV *FailurePoint; +}; + +/// Check the monotonicity of a SCEV. Since dependence tests (SIV, MIV, etc.) +/// assume that subscript expressions are (multivariate) monotonic, we need to +/// verify this property before applying those tests. Violating this assumption +/// may cause them to produce incorrect results. +struct SCEVMonotonicityChecker + : public SCEVVisitor<SCEVMonotonicityChecker, SCEVMonotonicity> { + + SCEVMonotonicityChecker(ScalarEvolution *SE) : SE(SE) {} + + /// Check the monotonicity of \p Expr. \p Expr must be of integer type. If \p + /// OutermostLoop is not null, \p Expr must be defined in \p OutermostLoop or + /// one of its nested loops. + SCEVMonotonicity checkMonotonicity(const SCEV *Expr, + const Loop *OutermostLoop); + +private: + ScalarEvolution *SE; + + /// The outermost loop that DA is analyzing. + const Loop *OutermostLoop; + + /// A helper to classify \p Expr as either Invariant or Unknown. + SCEVMonotonicity invariantOrUnknown(const SCEV *Expr); + + /// Return true if \p Expr is loop-invariant with respect to the outermost + /// loop. + bool isLoopInvariant(const SCEV *Expr) const; + + /// A helper to create an Unknown SCEVMonotonicity. 
+ SCEVMonotonicity createUnknown(const SCEV *FailurePoint) { + return SCEVMonotonicity(SCEVMonotonicityType::Unknown, FailurePoint); + } + + SCEVMonotonicity visitAddRecExpr(const SCEVAddRecExpr *Expr); + + SCEVMonotonicity visitConstant(const SCEVConstant *) { + return SCEVMonotonicity(SCEVMonotonicityType::Invariant); + } + SCEVMonotonicity visitVScale(const SCEVVScale *) { + return SCEVMonotonicity(SCEVMonotonicityType::Invariant); + } + + // TODO: Handle more cases. + SCEVMonotonicity visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitSignExtendExpr(const SCEVSignExtendExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitAddExpr(const SCEVAddExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitMulExpr(const SCEVMulExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitPtrToIntExpr(const SCEVPtrToIntExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitTruncateExpr(const SCEVTruncateExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitUDivExpr(const SCEVUDivExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitSMaxExpr(const SCEVSMaxExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitUMaxExpr(const SCEVUMaxExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitSMinExpr(const SCEVSMinExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitUMinExpr(const SCEVUMinExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitSequentialUMinExpr(const SCEVSequentialUMinExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitUnknown(const SCEVUnknown *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitCouldNotCompute(const SCEVCouldNotCompute *Expr) { + return invariantOrUnknown(Expr); + } + + friend struct SCEVVisitor<SCEVMonotonicityChecker, SCEVMonotonicity>; +}; + +} // anonymous namespace + // Used to test the dependence analyzer. // Looks through the function, noting instructions that may access memory. // Calls depends() on every possible pair and prints out the result. // Ignores all other instructions. 
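To make the monotonicity definition above concrete, here is a small self-contained sketch (plain C++, illustrative only and not part of the patch; the function F and the bounds are made-up stand-ins) that brute-forces the property for a two-loop affine function F(i_1, i_2) = X + A*i_1 + B*i_2 over the closed ranges [0, BTC_k]:

#include <cassert>
#include <cstdint>

// F models a SCEV over two nested loops: F(i1, i2) = X + A*i1 + B*i2.
static int64_t F(int64_t X, int64_t A, int64_t B, int64_t i1, int64_t i2) {
  return X + A * i1 + B * i2;
}

// Check "monotonic with respect to the 1st loop" by the definition above:
// for every fixed i2, x <= y must imply F(x, i2) <= F(y, i2) (increasing)
// or F(x, i2) >= F(y, i2) (decreasing). BTC1/BTC2 play the role of the
// backedge-taken counts.
static bool isMonotonicInLoop1(int64_t X, int64_t A, int64_t B, int64_t BTC1,
                               int64_t BTC2) {
  bool NonDecreasing = true, NonIncreasing = true;
  for (int64_t i2 = 0; i2 <= BTC2; ++i2)
    for (int64_t x = 0; x <= BTC1; ++x)
      for (int64_t y = x; y <= BTC1; ++y) {
        if (F(X, A, B, x, i2) > F(X, A, B, y, i2))
          NonDecreasing = false;
        if (F(X, A, B, x, i2) < F(X, A, B, y, i2))
          NonIncreasing = false;
      }
  return NonDecreasing || NonIncreasing;
}

int main() {
  // An affine function with loop-invariant steps is monotonic in each loop,
  // regardless of the signs of the steps.
  assert(isMonotonicInLoop1(/*X=*/7, /*A=*/3, /*B=*/-2, /*BTC1=*/8, /*BTC2=*/8));
  return 0;
}

This exhaustive check is exactly the intuition that the SCEVMonotonicityChecker formalizes symbolically; the dump routine described next exercises those checks over real load/store subscripts.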
static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, - ScalarEvolution &SE, bool NormalizeResults) { + ScalarEvolution &SE, LoopInfo &LI, + bool NormalizeResults) { auto *F = DA->getFunction(); + + if (DumpMonotonicityReport) { + SCEVMonotonicityChecker Checker(&SE); + OS << "Monotonicity check:\n"; + for (Instruction &Inst : instructions(F)) { + if (!isa<LoadInst>(Inst) && !isa<StoreInst>(Inst)) + continue; + Value *Ptr = getLoadStorePointerOperand(&Inst); + const Loop *L = LI.getLoopFor(Inst.getParent()); + const SCEV *PtrSCEV = SE.getSCEVAtScope(Ptr, L); + const SCEV *AccessFn = SE.removePointerBase(PtrSCEV); + SCEVMonotonicity Mon = Checker.checkMonotonicity(AccessFn, L); + OS.indent(2) << "Inst: " << Inst << "\n"; + OS.indent(4) << "Expr: " << *AccessFn << "\n"; + Mon.print(OS, 4); + } + OS << "\n"; + } + for (inst_iterator SrcI = inst_begin(F), SrcE = inst_end(F); SrcI != SrcE; ++SrcI) { if (SrcI->mayReadOrWriteMemory()) { @@ -235,7 +468,8 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, void DependenceAnalysisWrapperPass::print(raw_ostream &OS, const Module *) const { dumpExampleDependence( - OS, info.get(), getAnalysis<ScalarEvolutionWrapperPass>().getSE(), false); + OS, info.get(), getAnalysis<ScalarEvolutionWrapperPass>().getSE(), + getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), false); } PreservedAnalyses @@ -244,7 +478,7 @@ DependenceAnalysisPrinterPass::run(Function &F, FunctionAnalysisManager &FAM) { << "':\n"; dumpExampleDependence(OS, &FAM.getResult<DependenceAnalysis>(F), FAM.getResult<ScalarEvolutionAnalysis>(F), - NormalizeResults); + FAM.getResult<LoopAnalysis>(F), NormalizeResults); return PreservedAnalyses::all(); } @@ -671,6 +905,81 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { } //===----------------------------------------------------------------------===// +// SCEVMonotonicity + +SCEVMonotonicity::SCEVMonotonicity(SCEVMonotonicityType Type, + const SCEV *FailurePoint) + : Type(Type), FailurePoint(FailurePoint) { + assert( + ((Type == SCEVMonotonicityType::Unknown) == (FailurePoint != nullptr)) && + "FailurePoint must be provided iff Type is Unknown"); +} + +void SCEVMonotonicity::print(raw_ostream &OS, unsigned Depth) const { + OS.indent(Depth) << "Monotonicity: "; + switch (Type) { + case SCEVMonotonicityType::Unknown: + assert(FailurePoint && "FailurePoint must be provided for Unknown"); + OS << "Unknown\n"; + OS.indent(Depth) << "Reason: " << *FailurePoint << "\n"; + break; + case SCEVMonotonicityType::Invariant: + OS << "Invariant\n"; + break; + case SCEVMonotonicityType::MultivariateSignedMonotonic: + OS << "MultivariateSignedMonotonic\n"; + break; + } +} + +bool SCEVMonotonicityChecker::isLoopInvariant(const SCEV *Expr) const { + return !OutermostLoop || SE->isLoopInvariant(Expr, OutermostLoop); +} + +SCEVMonotonicity SCEVMonotonicityChecker::invariantOrUnknown(const SCEV *Expr) { + if (isLoopInvariant(Expr)) + return SCEVMonotonicity(SCEVMonotonicityType::Invariant); + return createUnknown(Expr); +} + +SCEVMonotonicity +SCEVMonotonicityChecker::checkMonotonicity(const SCEV *Expr, + const Loop *OutermostLoop) { + assert(Expr->getType()->isIntegerTy() && "Expr must be integer type"); + this->OutermostLoop = OutermostLoop; + return visit(Expr); +} + +/// We only care about an affine AddRec at the moment. For an affine AddRec, +/// the monotonicity can be inferred from its nowrap property. For example, let +/// X and Y be loop-invariant, and assume Y is non-negative. 
An AddRec +/// {X,+,Y}<nsw> implies: +/// +/// X <=s (X + Y) <=s ((X + Y) + Y) <=s ... +/// +/// Thus, we can conclude that the AddRec is monotonically increasing with +/// respect to the associated loop in a signed sense. Similar reasoning +/// applies when Y is non-positive, leading to a monotonically decreasing +/// AddRec. +SCEVMonotonicity +SCEVMonotonicityChecker::visitAddRecExpr(const SCEVAddRecExpr *Expr) { + if (!Expr->isAffine() || !Expr->hasNoSignedWrap()) + return createUnknown(Expr); + + const SCEV *Start = Expr->getStart(); + const SCEV *Step = Expr->getStepRecurrence(*SE); + + SCEVMonotonicity StartMon = visit(Start); + if (StartMon.isUnknown()) + return StartMon; + + if (!isLoopInvariant(Step)) + return createUnknown(Expr); + + return SCEVMonotonicity(SCEVMonotonicityType::MultivariateSignedMonotonic); +} + +//===----------------------------------------------------------------------===// // DependenceInfo methods // For debugging purposes. Dumps a dependence to OS. @@ -1273,6 +1582,13 @@ static const SCEV *minusSCEVNoSignedOverflow(const SCEV *A, const SCEV *B, return nullptr; } +/// Returns true iff \p Test is enabled. +static bool isDependenceTestEnabled(DependenceTestType Test) { + if (EnableDependenceTest == DependenceTestType::All) + return true; + return EnableDependenceTest == Test; +} + // testZIV - // When we have a pair of subscripts of the form [c1] and [c2], // where c1 and c2 are both loop invariant, we attack it using @@ -1334,6 +1650,9 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, const Loop *CurDstLoop, unsigned Level, FullDependence &Result, Constraint &NewConstraint) const { + if (!isDependenceTestEnabled(DependenceTestType::StrongSIV)) + return false; + LLVM_DEBUG(dbgs() << "\tStrong SIV test\n"); LLVM_DEBUG(dbgs() << "\t Coeff = " << *Coeff); LLVM_DEBUG(dbgs() << ", " << *Coeff->getType() << "\n"); @@ -1468,6 +1787,9 @@ bool DependenceInfo::weakCrossingSIVtest( const Loop *CurSrcLoop, const Loop *CurDstLoop, unsigned Level, FullDependence &Result, Constraint &NewConstraint, const SCEV *&SplitIter) const { + if (!isDependenceTestEnabled(DependenceTestType::WeakCrossingSIV)) + return false; + LLVM_DEBUG(dbgs() << "\tWeak-Crossing SIV test\n"); LLVM_DEBUG(dbgs() << "\t Coeff = " << *Coeff << "\n"); LLVM_DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n"); @@ -1726,6 +2048,9 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, const Loop *CurDstLoop, unsigned Level, FullDependence &Result, Constraint &NewConstraint) const { + if (!isDependenceTestEnabled(DependenceTestType::ExactSIV)) + return false; + LLVM_DEBUG(dbgs() << "\tExact SIV test\n"); LLVM_DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n"); LLVM_DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n"); @@ -1905,6 +2230,9 @@ bool DependenceInfo::weakZeroSrcSIVtest( const SCEV *DstCoeff, const SCEV *SrcConst, const SCEV *DstConst, const Loop *CurSrcLoop, const Loop *CurDstLoop, unsigned Level, FullDependence &Result, Constraint &NewConstraint) const { + if (!isDependenceTestEnabled(DependenceTestType::WeakZeroSIV)) + return false; + // For the WeakSIV test, it's possible the loop isn't common to // the Src and Dst loops. If it isn't, then there's no need to // record a direction. 
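A quick way to see why visitAddRecExpr above insists on the <nsw> flag: once a signed wrap occurs, the chain X <=s (X + Y) <=s ... breaks. The sketch below (plain C++ emulating an i8 AddRec {X,+,Y} with X = 100, Y = 20; values are arbitrary and only for illustration) prints where the wrap destroys monotonicity:

#include <cstdint>
#include <cstdio>

int main() {
  // Emulate {X,+,Y} on i8: Val_{n+1} = Val_n + Y.
  int8_t Val = 100;
  for (int It = 0; It < 3; ++It) {
    int8_t Next = (int8_t)(Val + 20); // i8 addition; may wrap
    bool Wrapped = (Next < Val);      // a signed wrap breaks monotonicity
    printf("iter %d: %4d -> %4d %s\n", It, Val, Next,
           Wrapped ? "(signed wrap; not monotonic)" : "(still non-decreasing)");
    Val = Next;
  }
  // With <nsw>, the wrap above would be poison, so SCEV may assume it never
  // happens. That assumption is what licenses classifying an affine AddRec
  // with a loop-invariant step as MultivariateSignedMonotonic.
  return 0;
}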
@@ -2013,6 +2341,9 @@ bool DependenceInfo::weakZeroDstSIVtest( const SCEV *SrcCoeff, const SCEV *SrcConst, const SCEV *DstConst, const Loop *CurSrcLoop, const Loop *CurDstLoop, unsigned Level, FullDependence &Result, Constraint &NewConstraint) const { + if (!isDependenceTestEnabled(DependenceTestType::WeakZeroSIV)) + return false; + // For the WeakSIV test, it's possible the loop isn't common to the // Src and Dst loops. If it isn't, then there's no need to record a direction. LLVM_DEBUG(dbgs() << "\tWeak-Zero (dst) SIV test\n"); @@ -2096,8 +2427,9 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, const SCEV *SrcConst, const SCEV *DstConst, const Loop *SrcLoop, const Loop *DstLoop, FullDependence &Result) const { - if (RunSIVRoutinesOnly) + if (!isDependenceTestEnabled(DependenceTestType::ExactRDIV)) return false; + LLVM_DEBUG(dbgs() << "\tExact RDIV test\n"); LLVM_DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n"); LLVM_DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n"); @@ -2242,8 +2574,9 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, const SCEV *C1, const SCEV *C2, const Loop *Loop1, const Loop *Loop2) const { - if (RunSIVRoutinesOnly) + if (!isDependenceTestEnabled(DependenceTestType::SymbolicRDIV)) return false; + ++SymbolicRDIVapplications; LLVM_DEBUG(dbgs() << "\ttry symbolic RDIV test\n"); LLVM_DEBUG(dbgs() << "\t A1 = " << *A1); @@ -2557,8 +2890,9 @@ bool DependenceInfo::accumulateCoefficientsGCD(const SCEV *Expr, // to "a common divisor". bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, FullDependence &Result) const { - if (RunSIVRoutinesOnly) + if (!isDependenceTestEnabled(DependenceTestType::GCDMIV)) return false; + LLVM_DEBUG(dbgs() << "starting gcd\n"); ++GCDapplications; unsigned BitWidth = SE->getTypeSizeInBits(Src->getType()); @@ -2725,8 +3059,9 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst, const SmallBitVector &Loops, FullDependence &Result) const { - if (RunSIVRoutinesOnly) + if (!isDependenceTestEnabled(DependenceTestType::BanerjeeMIV)) return false; + LLVM_DEBUG(dbgs() << "starting Banerjee\n"); ++BanerjeeApplications; LLVM_DEBUG(dbgs() << " Src = " << *Src << '\n'); @@ -3488,10 +3823,19 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst, // resize Pair to contain as many pairs of subscripts as the delinearization // has found, and then initialize the pairs following the delinearization. Pair.resize(Size); + SCEVMonotonicityChecker MonChecker(SE); + const Loop *OutermostLoop = SrcLoop ? SrcLoop->getOutermostLoop() : nullptr; for (int I = 0; I < Size; ++I) { Pair[I].Src = SrcSubscripts[I]; Pair[I].Dst = DstSubscripts[I]; unifySubscriptType(&Pair[I]); + + if (EnableMonotonicityCheck) { + if (MonChecker.checkMonotonicity(Pair[I].Src, OutermostLoop).isUnknown()) + return false; + if (MonChecker.checkMonotonicity(Pair[I].Dst, OutermostLoop).isUnknown()) + return false; + } } return true; @@ -3824,6 +4168,14 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, Pair[0].Src = SrcEv; Pair[0].Dst = DstEv; + SCEVMonotonicityChecker MonChecker(SE); + const Loop *OutermostLoop = SrcLoop ? 
SrcLoop->getOutermostLoop() : nullptr; + if (EnableMonotonicityCheck) + if (MonChecker.checkMonotonicity(Pair[0].Src, OutermostLoop).isUnknown() || + MonChecker.checkMonotonicity(Pair[0].Dst, OutermostLoop).isUnknown()) + return std::make_unique<Dependence>(Src, Dst, + SCEVUnionPredicate(Assume, *SE)); + if (Delinearize) { if (tryDelinearize(Src, Dst, Pair)) { LLVM_DEBUG(dbgs() << " delinearized\n"); diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index dc813f6..b573023 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4866,6 +4866,89 @@ static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F, return nullptr; } +/// Look for the following pattern and simplify %to_fold to %identicalPhi. +/// Here %phi, %to_fold, and %phi.next compute the same values as +/// %identicalPhi, and hence the select instruction %to_fold can be folded +/// into %identicalPhi. +/// +/// BB1: +/// %identicalPhi = phi [ X, %BB0 ], [ %identicalPhi.next, %BB1 ] +/// %phi = phi [ X, %BB0 ], [ %phi.next, %BB1 ] +/// ... +/// %identicalPhi.next = select %cmp, %val, %identicalPhi +/// (or select %cmp, %identicalPhi, %val) +/// %to_fold = select %cmp2, %identicalPhi, %phi +/// %phi.next = select %cmp, %val, %to_fold +/// (or select %cmp, %to_fold, %val) +/// +/// Prove that %phi and %identicalPhi are the same by induction: +/// +/// Base case: Both %phi and %identicalPhi are equal on entry to the loop. +/// Inductive case: +/// Suppose %phi and %identicalPhi are equal at iteration i. +/// We look at their values at iteration i+1, which are %phi.next and +/// %identicalPhi.next. They could become different only if %cmp is +/// false and the corresponding values %to_fold and %identicalPhi differ +/// (similar reasoning applies to the parenthesized "or" forms). +/// +/// The only case in which %to_fold and %identicalPhi could differ is when +/// %cmp2 is false and %to_fold is %phi; but then %phi would have to differ +/// from %identicalPhi, contradicting our inductive hypothesis that %phi and +/// %identicalPhi are equal. Thus %phi and %identicalPhi are always equal at +/// iteration i+1. +bool isSimplifierIdenticalPHI(PHINode &PN, PHINode &IdenticalPN) { + if (PN.getParent() != IdenticalPN.getParent()) + return false; + if (PN.getNumIncomingValues() != 2) + return false; + + // Check that only the backedge incoming value is different. + unsigned DiffVals = 0; + BasicBlock *DiffValBB = nullptr; + for (unsigned i = 0; i < 2; i++) { + BasicBlock *PredBB = PN.getIncomingBlock(i); + if (PN.getIncomingValueForBlock(PredBB) != + IdenticalPN.getIncomingValueForBlock(PredBB)) { + DiffVals++; + DiffValBB = PredBB; + } + } + if (DiffVals != 1) + return false; + // Now check that the backedge incoming values are two select + // instructions with the same condition. Either their true + // values are the same, or their false values are the same. 
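The induction argument above can also be sanity-checked by direct simulation. Below is a standalone sketch (plain C++; the predicates and values are random stand-ins for %cmp, %cmp2, and %val, and the variable names are hypothetical) that runs the select network from the comment and asserts %phi == %identicalPhi on every iteration:

#include <cassert>
#include <cstdlib>

int main() {
  // Entry values: both PHIs start at X (base case of the induction).
  int X = 42;
  int IdenticalPhi = X, Phi = X;

  for (int It = 0; It < 1000; ++It) {
    bool Cmp = std::rand() & 1;  // %cmp  (arbitrary predicate)
    bool Cmp2 = std::rand() & 1; // %cmp2 (arbitrary predicate)
    int Val = std::rand() % 100; // %val

    int IdenticalPhiNext = Cmp ? Val : IdenticalPhi; // %identicalPhi.next
    int ToFold = Cmp2 ? IdenticalPhi : Phi;          // %to_fold
    int PhiNext = Cmp ? Val : ToFold;                // %phi.next

    IdenticalPhi = IdenticalPhiNext;
    Phi = PhiNext;
    assert(Phi == IdenticalPhi && "inductive step holds");
  }
  return 0;
}

Note how the inductive step shows up in the code: whenever Phi == IdenticalPhi at the top of the loop, ToFold collapses to IdenticalPhi regardless of Cmp2, so PhiNext and IdenticalPhiNext are computed identically.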
+ auto *SI = dyn_cast<SelectInst>(PN.getIncomingValueForBlock(DiffValBB)); + auto *IdenticalSI = + dyn_cast<SelectInst>(IdenticalPN.getIncomingValueForBlock(DiffValBB)); + if (!SI || !IdenticalSI) + return false; + if (SI->getCondition() != IdenticalSI->getCondition()) + return false; + + SelectInst *SIOtherVal = nullptr; + Value *IdenticalSIOtherVal = nullptr; + if (SI->getTrueValue() == IdenticalSI->getTrueValue()) { + SIOtherVal = dyn_cast<SelectInst>(SI->getFalseValue()); + IdenticalSIOtherVal = IdenticalSI->getFalseValue(); + } else if (SI->getFalseValue() == IdenticalSI->getFalseValue()) { + SIOtherVal = dyn_cast<SelectInst>(SI->getTrueValue()); + IdenticalSIOtherVal = IdenticalSI->getTrueValue(); + } else { + return false; + } + + // Now check that the other values in select, i.e., %to_fold and + // %identicalPhi, are essentially the same value. + if (!SIOtherVal || IdenticalSIOtherVal != &IdenticalPN) + return false; + if (!(SIOtherVal->getTrueValue() == &IdenticalPN && + SIOtherVal->getFalseValue() == &PN) && + !(SIOtherVal->getTrueValue() == &PN && + SIOtherVal->getFalseValue() == &IdenticalPN)) + return false; + return true; +} + /// Given operands for a SelectInst, see if we can fold the result. /// If not, this returns null. static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, @@ -5041,7 +5124,14 @@ static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, std::optional<bool> Imp = isImpliedByDomCondition(Cond, Q.CxtI, Q.DL); if (Imp) return *Imp ? TrueVal : FalseVal; - + // Look for same PHIs in the true and false values. + if (auto *TruePHI = dyn_cast<PHINode>(TrueVal)) + if (auto *FalsePHI = dyn_cast<PHINode>(FalseVal)) { + if (isSimplifierIdenticalPHI(*TruePHI, *FalsePHI)) + return FalseVal; + if (isSimplifierIdenticalPHI(*FalsePHI, *TruePHI)) + return TrueVal; + } return nullptr; } @@ -5106,32 +5196,33 @@ static Value *simplifyGEPInst(Type *SrcTy, Value *Ptr, return Ptr; // The following transforms are only safe if the ptrtoint cast - // doesn't truncate the pointers. - if (Indices[0]->getType()->getScalarSizeInBits() == - Q.DL.getPointerSizeInBits(AS)) { + // doesn't truncate the address of the pointers. The non-address bits + // must be the same, as the underlying objects are the same. + if (Indices[0]->getType()->getScalarSizeInBits() >= + Q.DL.getAddressSizeInBits(AS)) { auto CanSimplify = [GEPTy, &P, Ptr]() -> bool { return P->getType() == GEPTy && getUnderlyingObject(P) == getUnderlyingObject(Ptr); }; // getelementptr V, (sub P, V) -> P if P points to a type of size 1. if (TyAllocSize == 1 && - match(Indices[0], - m_Sub(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Specific(Ptr)))) && + match(Indices[0], m_Sub(m_PtrToIntOrAddr(m_Value(P)), + m_PtrToIntOrAddr(m_Specific(Ptr)))) && CanSimplify()) return P; // getelementptr V, (ashr (sub P, V), C) -> P if P points to a type of // size 1 << C. - if (match(Indices[0], m_AShr(m_Sub(m_PtrToInt(m_Value(P)), - m_PtrToInt(m_Specific(Ptr))), + if (match(Indices[0], m_AShr(m_Sub(m_PtrToIntOrAddr(m_Value(P)), + m_PtrToIntOrAddr(m_Specific(Ptr))), m_ConstantInt(C))) && TyAllocSize == 1ULL << C && CanSimplify()) return P; // getelementptr V, (sdiv (sub P, V), C) -> P if P points to a type of // size C. 
- if (match(Indices[0], m_SDiv(m_Sub(m_PtrToInt(m_Value(P)), - m_PtrToInt(m_Specific(Ptr))), + if (match(Indices[0], m_SDiv(m_Sub(m_PtrToIntOrAddr(m_Value(P)), + m_PtrToIntOrAddr(m_Specific(Ptr))), m_SpecificInt(TyAllocSize))) && CanSimplify()) return P; diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index a60a4bb..6cdf51a 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -1100,12 +1100,12 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( for (auto &GlobalList : Index) { // Ignore entries for references that are undefined in the current module. - if (GlobalList.second.SummaryList.empty()) + if (GlobalList.second.getSummaryList().empty()) continue; - assert(GlobalList.second.SummaryList.size() == 1 && + assert(GlobalList.second.getSummaryList().size() == 1 && "Expected module's index to have one summary per GUID"); - auto &Summary = GlobalList.second.SummaryList[0]; + auto &Summary = GlobalList.second.getSummaryList()[0]; if (!IsThinLTO) { Summary->setNotEligibleToImport(); continue; diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp index 5e94e0b..5e92ca1 100644 --- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp +++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp @@ -1136,7 +1136,7 @@ void llvm::generateParamAccessSummary(ModuleSummaryIndex &Index) { if (!AreStatisticsEnabled()) return; for (auto &GVS : Index) - for (auto &GV : GVS.second.SummaryList) + for (auto &GV : GVS.second.getSummaryList()) if (FunctionSummary *FS = dyn_cast<FunctionSummary>(GV.get())) Stat += FS->paramAccesses().size(); }; @@ -1147,7 +1147,7 @@ void llvm::generateParamAccessSummary(ModuleSummaryIndex &Index) { // Convert the ModuleSummaryIndex to a FunctionMap for (auto &GVS : Index) { - for (auto &GV : GVS.second.SummaryList) { + for (auto &GV : GVS.second.getSummaryList()) { FunctionSummary *FS = dyn_cast<FunctionSummary>(GV.get()); if (!FS || FS->paramAccesses().empty()) continue; diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index cf63285..f71a534 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -451,6 +451,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { UpgradeModuleFlags(*M); UpgradeNVVMAnnotations(*M); UpgradeSectionAttributes(*M); + copyModuleAttrToFunctions(*M); if (!Slots) return false; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index aaee1f0..cf7efbfa 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -7143,6 +7143,8 @@ Error BitcodeReader::materializeModule() { UpgradeARCRuntime(*TheModule); + copyModuleAttrToFunctions(*TheModule); + return Error::success(); } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 8ff3aa9..61aa7c2f5 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -235,7 +235,7 @@ public: return; for (const auto &GUIDSummaryLists : *Index) // Examine all summaries for this GUID. 
- for (auto &Summary : GUIDSummaryLists.second.SummaryList) + for (auto &Summary : GUIDSummaryLists.second.getSummaryList()) if (auto FS = dyn_cast<FunctionSummary>(Summary.get())) { // For each call in the function summary, see if the call // is to a GUID (which means it is for an indirect call, @@ -587,7 +587,7 @@ public: } } else { for (auto &Summaries : Index) - for (auto &Summary : Summaries.second.SummaryList) + for (auto &Summary : Summaries.second.getSummaryList()) Callback({Summaries.first, Summary.get()}, false); } } diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp index 72bb98c..64cbe9d 100644 --- a/llvm/lib/CAS/OnDiskGraphDB.cpp +++ b/llvm/lib/CAS/OnDiskGraphDB.cpp @@ -836,6 +836,7 @@ uint64_t DataRecordHandle::getDataSize() const { case DataSizeFlags::Uses8B: return support::endian::read64le(DataSizePtr); } + llvm_unreachable("Unknown DataSizeFlags enum"); } void DataRecordHandle::skipDataSize(LayoutFlags LF, int64_t &RelOffset) const { @@ -863,6 +864,7 @@ uint32_t DataRecordHandle::getNumRefs() const { case NumRefsFlags::Uses8B: return support::endian::read64le(NumRefsPtr); } + llvm_unreachable("Unknown NumRefsFlags enum"); } void DataRecordHandle::skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const { @@ -1270,6 +1272,7 @@ Expected<bool> OnDiskGraphDB::isMaterialized(ObjectID Ref) { return FaultInResult.takeError(); return true; } + llvm_unreachable("Unknown ObjectPresence enum"); } Expected<OnDiskGraphDB::ObjectPresence> diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index e2af0c5..fefde64f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1438,6 +1438,7 @@ getBBAddrMapFeature(const MachineFunction &MF, int NumMBBSectionRanges, BBFreqEnabled, BrProbEnabled, MF.hasBBSections() && NumMBBSectionRanges > 1, + // Use static_cast to avoid breakage of tests on Windows. static_cast<bool>(BBAddrMapSkipEmitBBEntries), HasCalls, false}; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index f0f0861..c7d45897 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -566,32 +566,54 @@ bool DwarfExpression::addExpression( case dwarf::DW_OP_LLVM_extract_bits_zext: { unsigned SizeInBits = Op->getArg(1); unsigned BitOffset = Op->getArg(0); + unsigned DerefSize = 0; + // Operations are done in the DWARF "generic type" whose size + // is the size of a pointer. + unsigned PtrSizeInBytes = CU.getAsmPrinter()->MAI->getCodePointerSize(); // If we have a memory location then dereference to get the value, though // we have to make sure we don't dereference any bytes past the end of the // object. if (isMemoryLocation()) { - emitOp(dwarf::DW_OP_deref_size); - emitUnsigned(alignTo(BitOffset + SizeInBits, 8) / 8); + DerefSize = alignTo(BitOffset + SizeInBits, 8) / 8; + if (DerefSize == PtrSizeInBytes) { + emitOp(dwarf::DW_OP_deref); + } else { + emitOp(dwarf::DW_OP_deref_size); + emitUnsigned(DerefSize); + } } - // Extract the bits by a shift left (to shift out the bits after what we - // want to extract) followed by shift right (to shift the bits to position - // 0 and also sign/zero extend). These operations are done in the DWARF - // "generic type" whose size is the size of a pointer. 
- unsigned PtrSizeInBytes = CU.getAsmPrinter()->MAI->getCodePointerSize(); - unsigned LeftShift = PtrSizeInBytes * 8 - (SizeInBits + BitOffset); - unsigned RightShift = LeftShift + BitOffset; - if (LeftShift) { - emitOp(dwarf::DW_OP_constu); - emitUnsigned(LeftShift); - emitOp(dwarf::DW_OP_shl); - } - if (RightShift) { - emitOp(dwarf::DW_OP_constu); - emitUnsigned(RightShift); - emitOp(OpNum == dwarf::DW_OP_LLVM_extract_bits_sext ? dwarf::DW_OP_shra - : dwarf::DW_OP_shr); + // If a dereference was emitted for an unsigned value, and + // there's no bit offset, then a bit of optimization is + // possible. + if (OpNum == dwarf::DW_OP_LLVM_extract_bits_zext && BitOffset == 0) { + if (8 * DerefSize == SizeInBits) { + // The correct value is already on the stack. + } else { + // No need to shift, we can just mask off the desired bits. + emitOp(dwarf::DW_OP_constu); + emitUnsigned((1u << SizeInBits) - 1); + emitOp(dwarf::DW_OP_and); + } + } else { + // Extract the bits by a shift left (to shift out the bits after what we + // want to extract) followed by shift right (to shift the bits to + // position 0 and also sign/zero extend). + unsigned LeftShift = PtrSizeInBytes * 8 - (SizeInBits + BitOffset); + unsigned RightShift = LeftShift + BitOffset; + if (LeftShift) { + emitOp(dwarf::DW_OP_constu); + emitUnsigned(LeftShift); + emitOp(dwarf::DW_OP_shl); + } + if (RightShift) { + emitOp(dwarf::DW_OP_constu); + emitUnsigned(RightShift); + emitOp(OpNum == dwarf::DW_OP_LLVM_extract_bits_sext + ? dwarf::DW_OP_shra + : dwarf::DW_OP_shr); + } } // The value is now at the top of the stack, so set the location to diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index c438eae..9795a0b 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -98,6 +98,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineUniformityAnalysisPassPass(Registry); initializeMIR2VecVocabLegacyAnalysisPass(Registry); initializeMIR2VecVocabPrinterLegacyPassPass(Registry); + initializeMIR2VecPrinterLegacyPassPass(Registry); initializeMachineUniformityInfoPrinterPassPass(Registry); initializeMachineVerifierLegacyPassPass(Registry); initializeObjCARCContractLegacyPassPass(Registry); diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index 0522698..c1365f4 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -64,7 +64,6 @@ CGOPT_EXP(uint64_t, LargeDataThreshold) CGOPT(ExceptionHandling, ExceptionModel) CGOPT_EXP(CodeGenFileType, FileType) CGOPT(FramePointerKind, FramePointerUsage) -CGOPT(bool, EnableUnsafeFPMath) CGOPT(bool, EnableNoInfsFPMath) CGOPT(bool, EnableNoNaNsFPMath) CGOPT(bool, EnableNoSignedZerosFPMath) @@ -219,12 +218,6 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { "Enable frame pointer elimination"))); CGBINDOPT(FramePointerUsage); - static cl::opt<bool> EnableUnsafeFPMath( - "enable-unsafe-fp-math", - cl::desc("Enable optimizations that may decrease FP precision"), - cl::init(false)); - CGBINDOPT(EnableUnsafeFPMath); - static cl::opt<bool> EnableNoInfsFPMath( "enable-no-infs-fp-math", cl::desc("Enable FP math optimizations that assume no +-Infs"), @@ -552,7 +545,6 @@ TargetOptions codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) { TargetOptions Options; Options.AllowFPOpFusion = getFuseFPOps(); - Options.UnsafeFPMath = getEnableUnsafeFPMath(); Options.NoInfsFPMath = getEnableNoInfsFPMath(); Options.NoNaNsFPMath = getEnableNoNaNsFPMath(); 
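Stepping back to the DwarfExpression.cpp change above: both the mask strategy (unsigned extract at bit offset 0) and the shl/shr(a) strategy reduce to ordinary integer arithmetic, so they can be cross-checked against a reference bitfield extract. A self-contained sketch (plain C++, assuming a 64-bit generic type; illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

// Reference: extract SizeInBits bits starting at BitOffset, zero-extended.
static uint64_t RefExtractZExt(uint64_t V, unsigned BitOffset,
                               unsigned SizeInBits) {
  return (V >> BitOffset) &
         ((SizeInBits == 64) ? ~0ULL : ((1ULL << SizeInBits) - 1));
}

int main() {
  const unsigned GenericBits = 64; // DWARF generic type = pointer size here
  uint64_t V = 0xDEADBEEFCAFEF00DULL;

  // Strategy 1 (zext, BitOffset == 0): just mask off the desired bits.
  unsigned SizeInBits = 12;
  uint64_t Masked = V & ((1ULL << SizeInBits) - 1);
  assert(Masked == RefExtractZExt(V, 0, SizeInBits));

  // Strategy 2: shift left to drop the bits above the field, then shift
  // right (logical for zext) to move the field to position 0.
  unsigned BitOffset = 5;
  unsigned LeftShift = GenericBits - (SizeInBits + BitOffset);
  unsigned RightShift = LeftShift + BitOffset;
  uint64_t Shifted = (V << LeftShift) >> RightShift;
  assert(Shifted == RefExtractZExt(V, BitOffset, SizeInBits));
  return 0;
}

For the sext variant, the lowering uses an arithmetic right shift (DW_OP_shra) in strategy 2, which is why the masking shortcut is reserved for the zext case.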
Options.NoSignedZerosFPMath = getEnableNoSignedZerosFPMath(); @@ -706,7 +698,6 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features, if (getStackRealign()) NewAttrs.addAttribute("stackrealign"); - HANDLE_BOOL_ATTR(EnableUnsafeFPMathView, "unsafe-fp-math"); HANDLE_BOOL_ATTR(EnableNoInfsFPMathView, "no-infs-fp-math"); HANDLE_BOOL_ATTR(EnableNoNaNsFPMathView, "no-nans-fp-math"); HANDLE_BOOL_ATTR(EnableNoSignedZerosFPMathView, "no-signed-zeros-fp-math"); diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp index f80e1e8..3ac6d2a 100644 --- a/llvm/lib/CodeGen/IfConversion.cpp +++ b/llvm/lib/CodeGen/IfConversion.cpp @@ -1498,7 +1498,7 @@ static void UpdatePredRedefs(MachineInstr &MI, LivePhysRegs &Redefs) { // Before stepping forward past MI, remember which regs were live // before MI. This is needed to set the Undef flag only when reg is // dead. - SparseSet<MCPhysReg, identity<MCPhysReg>> LiveBeforeMI; + SparseSet<MCPhysReg, MCPhysReg> LiveBeforeMI; LiveBeforeMI.setUniverse(TRI->getNumRegs()); for (unsigned Reg : Redefs) LiveBeforeMI.insert(Reg); diff --git a/llvm/lib/CodeGen/MIR2Vec.cpp b/llvm/lib/CodeGen/MIR2Vec.cpp index 5c78d98..99be1fc0 100644 --- a/llvm/lib/CodeGen/MIR2Vec.cpp +++ b/llvm/lib/CodeGen/MIR2Vec.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MIR2Vec.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/IR/Module.h" @@ -29,20 +30,30 @@ using namespace mir2vec; STATISTIC(MIRVocabMissCounter, "Number of lookups to MIR entities not present in the vocabulary"); -cl::OptionCategory llvm::mir2vec::MIR2VecCategory("MIR2Vec Options"); +namespace llvm { +namespace mir2vec { +cl::OptionCategory MIR2VecCategory("MIR2Vec Options"); // FIXME: Use a default vocab when not specified static cl::opt<std::string> VocabFile("mir2vec-vocab-path", cl::Optional, cl::desc("Path to the vocabulary file for MIR2Vec"), cl::init(""), cl::cat(MIR2VecCategory)); -cl::opt<float> - llvm::mir2vec::OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0), - cl::desc("Weight for machine opcode embeddings"), - cl::cat(MIR2VecCategory)); +cl::opt<float> OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0), + cl::desc("Weight for machine opcode embeddings"), + cl::cat(MIR2VecCategory)); +cl::opt<MIR2VecKind> MIR2VecEmbeddingKind( + "mir2vec-kind", cl::Optional, + cl::values(clEnumValN(MIR2VecKind::Symbolic, "symbolic", + "Generate symbolic embeddings for MIR")), + cl::init(MIR2VecKind::Symbolic), cl::desc("MIR2Vec embedding kind"), + cl::cat(MIR2VecCategory)); + +} // namespace mir2vec +} // namespace llvm //===----------------------------------------------------------------------===// -// Vocabulary Implementation +// Vocabulary //===----------------------------------------------------------------------===// MIRVocabulary::MIRVocabulary(VocabMap &&OpcodeEntries, @@ -188,6 +199,28 @@ void MIRVocabulary::buildCanonicalOpcodeMapping() { << " unique base opcodes\n"); } +Expected<MIRVocabulary> +MIRVocabulary::createDummyVocabForTest(const TargetInstrInfo &TII, + unsigned Dim) { + assert(Dim > 0 && "Dimension must be greater than zero"); + + float DummyVal = 0.1f; + + // Create dummy embeddings for all canonical opcode names + VocabMap DummyVocabMap; + for (unsigned Opcode = 0; Opcode < TII.getNumOpcodes(); ++Opcode) { + std::string BaseOpcode = extractBaseOpcodeName(TII.getName(Opcode)); + if 
(DummyVocabMap.count(BaseOpcode) == 0) { + // Only add if not already present + DummyVocabMap[BaseOpcode] = Embedding(Dim, DummyVal); + DummyVal += 0.1f; + } + } + + // Create and return vocabulary with dummy embeddings + return MIRVocabulary::create(std::move(DummyVocabMap), TII); +} + //===----------------------------------------------------------------------===// // MIR2VecVocabLegacyAnalysis Implementation //===----------------------------------------------------------------------===// @@ -258,7 +291,73 @@ MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) { } //===----------------------------------------------------------------------===// -// Printer Passes Implementation +// MIREmbedder and its subclasses +//===----------------------------------------------------------------------===// + +std::unique_ptr<MIREmbedder> MIREmbedder::create(MIR2VecKind Mode, + const MachineFunction &MF, + const MIRVocabulary &Vocab) { + switch (Mode) { + case MIR2VecKind::Symbolic: + return std::make_unique<SymbolicMIREmbedder>(MF, Vocab); + } + return nullptr; +} + +Embedding MIREmbedder::computeEmbeddings(const MachineBasicBlock &MBB) const { + Embedding MBBVector(Dimension, 0); + + // Get instruction info for opcode name resolution + const auto &Subtarget = MF.getSubtarget(); + const auto *TII = Subtarget.getInstrInfo(); + if (!TII) { + MF.getFunction().getContext().emitError( + "MIR2Vec: No TargetInstrInfo available; cannot compute embeddings"); + return MBBVector; + } + + // Process each machine instruction in the basic block + for (const auto &MI : MBB) { + // Skip debug instructions and other metadata + if (MI.isDebugInstr()) + continue; + MBBVector += computeEmbeddings(MI); + } + + return MBBVector; +} + +Embedding MIREmbedder::computeEmbeddings() const { + Embedding MFuncVector(Dimension, 0); + + // Consider all reachable machine basic blocks in the function + for (const auto *MBB : depth_first(&MF)) + MFuncVector += computeEmbeddings(*MBB); + return MFuncVector; +} + +SymbolicMIREmbedder::SymbolicMIREmbedder(const MachineFunction &MF, + const MIRVocabulary &Vocab) + : MIREmbedder(MF, Vocab) {} + +std::unique_ptr<SymbolicMIREmbedder> +SymbolicMIREmbedder::create(const MachineFunction &MF, + const MIRVocabulary &Vocab) { + return std::make_unique<SymbolicMIREmbedder>(MF, Vocab); +} + +Embedding SymbolicMIREmbedder::computeEmbeddings(const MachineInstr &MI) const { + // Skip debug instructions and other metadata + if (MI.isDebugInstr()) + return Embedding(Dimension, 0); + + // Todo: Add operand/argument contributions + + return Vocab[MI.getOpcode()]; +} + +//===----------------------------------------------------------------------===// +// Printer Passes //===----------------------------------------------------------------------===// char MIR2VecVocabPrinterLegacyPass::ID = 0; @@ -297,3 +396,56 @@ MachineFunctionPass * llvm::createMIR2VecVocabPrinterLegacyPass(raw_ostream &OS) { return new MIR2VecVocabPrinterLegacyPass(OS); } + +char MIR2VecPrinterLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(MIR2VecPrinterLegacyPass, "print-mir2vec", + "MIR2Vec Embedder Printer Pass", false, true) +INITIALIZE_PASS_DEPENDENCY(MIR2VecVocabLegacyAnalysis) +INITIALIZE_PASS_DEPENDENCY(MachineModuleInfoWrapperPass) +INITIALIZE_PASS_END(MIR2VecPrinterLegacyPass, "print-mir2vec", + "MIR2Vec Embedder Printer Pass", false, true) + +bool MIR2VecPrinterLegacyPass::runOnMachineFunction(MachineFunction &MF) { + auto &Analysis = getAnalysis<MIR2VecVocabLegacyAnalysis>(); + auto VocabOrErr = + 
Analysis.getMIR2VecVocabulary(*MF.getFunction().getParent()); + assert(VocabOrErr && "Failed to get MIR2Vec vocabulary"); + auto &MIRVocab = *VocabOrErr; + + auto Emb = mir2vec::MIREmbedder::create(MIR2VecEmbeddingKind, MF, MIRVocab); + if (!Emb) { + OS << "Error creating MIR2Vec embeddings for function " << MF.getName() + << "\n"; + return false; + } + + OS << "MIR2Vec embeddings for machine function " << MF.getName() << ":\n"; + OS << "Machine Function vector: "; + Emb->getMFunctionVector().print(OS); + + OS << "Machine basic block vectors:\n"; + for (const MachineBasicBlock &MBB : MF) { + OS << "Machine basic block: " << MBB.getFullName() << ":\n"; + Emb->getMBBVector(MBB).print(OS); + } + + OS << "Machine instruction vectors:\n"; + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + // Skip debug instructions as they are not + // embedded. + if (MI.isDebugInstr()) + continue; + + OS << "Machine instruction: "; + MI.print(OS); + Emb->getMInstVector(MI).print(OS); + } + } + + return false; +} + +MachineFunctionPass *llvm::createMIR2VecPrinterLegacyPass(raw_ostream &OS) { + return new MIR2VecPrinterLegacyPass(OS); +} diff --git a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp index d988a2a..e37f784 100644 --- a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp +++ b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/CodeGen/MIRFSDiscriminatorOptions.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Function.h" @@ -35,13 +36,10 @@ using namespace sampleprofutil; // TODO(xur): Remove this option and related code once we make true the // default. 
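The embedding computation in MIR2Vec.cpp above is, at its core, a sum of per-instruction vectors rolled up per basic block and then per function. A toy version of that accumulation (plain C++; the opcode-to-vector table is a made-up stand-in for MIRVocabulary, and reachability and debug-instruction filtering are elided):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

using Embedding = std::vector<double>;

// Element-wise accumulate B into A (both must have the same dimension).
static void addInto(Embedding &A, const Embedding &B) {
  for (size_t I = 0; I < A.size(); ++I)
    A[I] += B[I];
}

int main() {
  const unsigned Dim = 2;
  // Stand-in vocabulary: base opcode name -> embedding.
  std::map<std::string, Embedding> Vocab = {
      {"MOV", {0.1, 0.2}}, {"ADD", {0.3, 0.1}}, {"RET", {0.0, 0.5}}};

  // Two "basic blocks" of opcodes.
  std::vector<std::vector<std::string>> Blocks = {{"MOV", "ADD"}, {"RET"}};

  Embedding FuncVec(Dim, 0.0);
  for (const auto &MBB : Blocks) {
    Embedding BlockVec(Dim, 0.0);
    for (const auto &Opc : MBB)
      addInto(BlockVec, Vocab.at(Opc)); // per-instruction lookup, as in Vocab[opcode]
    addInto(FuncVec, BlockVec);         // function vector = sum of block vectors
  }
  printf("function vector: [%g, %g]\n", FuncVec[0], FuncVec[1]);
  return 0;
}

The printer pass above reports exactly these three granularities: the function vector, one vector per machine basic block, and one vector per (non-debug) machine instruction.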
-namespace llvm { -cl::opt<bool> ImprovedFSDiscriminator( +cl::opt<bool> llvm::ImprovedFSDiscriminator( "improved-fs-discriminator", cl::Hidden, cl::init(false), cl::desc("New FS discriminators encoding (incompatible with the original " "encoding)")); -} // namespace llvm - char MIRAddFSDiscriminators::ID = 0; INITIALIZE_PASS(MIRAddFSDiscriminators, DEBUG_TYPE, diff --git a/llvm/lib/CodeGen/MIRSampleProfile.cpp b/llvm/lib/CodeGen/MIRSampleProfile.cpp index 9bba50e8..d44f577 100644 --- a/llvm/lib/CodeGen/MIRSampleProfile.cpp +++ b/llvm/lib/CodeGen/MIRSampleProfile.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/CodeGen/MIRFSDiscriminatorOptions.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineDominators.h" @@ -62,9 +63,6 @@ static cl::opt<bool> ViewBFIAfter("fs-viewbfi-after", cl::Hidden, cl::init(false), cl::desc("View BFI after MIR loader")); -namespace llvm { -extern cl::opt<bool> ImprovedFSDiscriminator; -} char MIRProfileLoaderPass::ID = 0; INITIALIZE_PASS_BEGIN(MIRProfileLoaderPass, DEBUG_TYPE, diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 7acddff..729e73c 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -932,12 +932,11 @@ void MachineLICMImpl::InitRegPressure(MachineBasicBlock *BB) { void MachineLICMImpl::UpdateRegPressure(const MachineInstr *MI, bool ConsiderUnseenAsDef) { auto Cost = calcRegisterCost(MI, /*ConsiderSeen=*/true, ConsiderUnseenAsDef); - for (const auto &RPIdAndCost : Cost) { - unsigned Class = RPIdAndCost.first; - if (static_cast<int>(RegPressure[Class]) < -RPIdAndCost.second) + for (const auto &[Class, Weight] : Cost) { + if (static_cast<int>(RegPressure[Class]) < -Weight) RegPressure[Class] = 0; else - RegPressure[Class] += RPIdAndCost.second; + RegPressure[Class] += Weight; } } @@ -1215,11 +1214,10 @@ bool MachineLICMImpl::IsCheapInstruction(MachineInstr &MI) const { /// given cost matrix can cause high register pressure. bool MachineLICMImpl::CanCauseHighRegPressure( const SmallDenseMap<unsigned, int> &Cost, bool CheapInstr) { - for (const auto &RPIdAndCost : Cost) { - if (RPIdAndCost.second <= 0) + for (const auto &[Class, Weight] : Cost) { + if (Weight <= 0) continue; - unsigned Class = RPIdAndCost.first; int Limit = RegLimit[Class]; // Don't hoist cheap instructions if they would increase register pressure, @@ -1228,7 +1226,7 @@ bool MachineLICMImpl::CanCauseHighRegPressure( return true; for (const auto &RP : BackTrace) - if (static_cast<int>(RP[Class]) + RPIdAndCost.second >= Limit) + if (static_cast<int>(RP[Class]) + Weight >= Limit) return true; } @@ -1246,8 +1244,8 @@ void MachineLICMImpl::UpdateBackTraceRegPressure(const MachineInstr *MI) { // Update register pressure of blocks from loop header to current block. 
for (auto &RP : BackTrace) - for (const auto &RPIdAndCost : Cost) - RP[RPIdAndCost.first] += RPIdAndCost.second; + for (const auto &[Class, Weight] : Cost) + RP[Class] += Weight; } /// Return true if it is potentially profitable to hoist the given loop diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 804480c..72b364c 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -211,7 +211,7 @@ private: unsigned getSparseSetIndex() const { return VirtReg.virtRegIndex(); } }; - using LiveRegMap = SparseSet<LiveReg, identity<unsigned>, uint16_t>; + using LiveRegMap = SparseSet<LiveReg, unsigned, identity_cxx20, uint16_t>; /// This map contains entries for each virtual register that is currently /// available in a physical register. LiveRegMap LiveVirtRegs; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6bf9008..310d35d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16433,7 +16433,8 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { case ISD::OR: case ISD::XOR: if (!LegalOperations && N0.hasOneUse() && - (isConstantOrConstantVector(N0.getOperand(0), true) || + (N0.getOperand(0) == N0.getOperand(1) || + isConstantOrConstantVector(N0.getOperand(0), true) || isConstantOrConstantVector(N0.getOperand(1), true))) { // TODO: We already restricted this to pre-legalization, but for vectors // we are extra cautious to not create an unsupported operation. diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 437d0f4..bf1abfe 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -3765,6 +3765,8 @@ bool DAGTypeLegalizer::SoftPromoteHalfOperand(SDNode *N, unsigned OpNo) { case ISD::FP_TO_UINT: case ISD::LRINT: case ISD::LLRINT: + case ISD::LROUND: + case ISD::LLROUND: Res = SoftPromoteHalfOp_Op0WithStrict(N); break; case ISD::FP_TO_SINT_SAT: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 88a4a8b..b1776ea 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -429,7 +429,20 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) { - SDValue Op2 = GetPromotedInteger(N->getOperand(2)); + SDValue Op2 = N->getOperand(2); + switch (TLI.getExtendForAtomicRMWArg(N->getOpcode())) { + case ISD::SIGN_EXTEND: + Op2 = SExtPromotedInteger(Op2); + break; + case ISD::ZERO_EXTEND: + Op2 = ZExtPromotedInteger(Op2); + break; + case ISD::ANY_EXTEND: + Op2 = GetPromotedInteger(Op2); + break; + default: + llvm_unreachable("Invalid atomic op extension"); + } SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(), N->getChain(), N->getBasePtr(), diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp index 5eb86e7..049efe8 100644 --- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp +++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp @@ -51,7 +51,7 @@ bool TargetOptions::FramePointerIsReserved(const MachineFunction &MF) const { /// HonorSignDependentRoundingFPMath - Return true if the codegen must assume /// that the rounding mode of the FPU can change from its default. 
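Returning to PromoteIntRes_Atomic1 in LegalizeIntegerTypes.cpp above: the extension kind obtained from getExtendForAtomicRMWArg matters because comparison-based RMW operations are only preserved under the matching extension when a narrow operand is widened. A minimal arithmetic demonstration (plain C++, no atomics involved; a signed i8 min computed in 32 bits):

#include <cassert>
#include <cstdint>

int main() {
  int8_t Mem = 5, Arg = -1;

  // Correct promotion: sign-extend both i8 values, take the min in 32 bits,
  // truncate back. min(5, -1) == -1, matching the i8 semantics.
  int32_t SExtMin = (int32_t)Mem < (int32_t)Arg ? Mem : Arg;
  assert((int8_t)SExtMin == -1);

  // Wrong promotion: zero-extend instead; -1 becomes 255, so the 32-bit min
  // picks 5 and the narrow result no longer matches the i8 semantics.
  uint32_t ZMem = (uint8_t)Mem, ZArg = (uint8_t)Arg;
  uint32_t ZExtMin = ZMem < ZArg ? ZMem : ZArg;
  assert((int8_t)ZExtMin == 5);
  return 0;
}

This is why the promotion switches on the opcode rather than always using GetPromotedInteger: a signed min/max needs SIGN_EXTEND, an unsigned one ZERO_EXTEND, and only extension-agnostic operations can take ANY_EXTEND.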
bool TargetOptions::HonorSignDependentRoundingFPMath() const { - return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption; + return HonorSignDependentRoundingFPMathOption; } /// NOTE: There are targets that still do not support the debug entry values diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 212a0c0..db5cc37 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -107,6 +107,28 @@ static DWARFDie resolveReferencedType(DWARFDie D, DWARFFormValue F) { return D.getAttributeValueAsReferencedDie(F).resolveTypeUnitReference(); } +static llvm::StringRef +prettyLanguageVersionString(const DWARFAttribute &AttrValue, + const DWARFDie &Die) { + if (AttrValue.Attr != DW_AT_language_version) + return {}; + + auto NameForm = Die.find(DW_AT_language_name); + if (!NameForm) + return {}; + + auto LName = NameForm->getAsUnsignedConstant(); + if (!LName) + return {}; + + auto LVersion = AttrValue.Value.getAsUnsignedConstant(); + if (!LVersion) + return {}; + + return llvm::dwarf::LanguageDescription( + static_cast<SourceLanguageName>(*LName), *LVersion); +} + static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, const DWARFAttribute &AttrValue, unsigned Indent, DIDumpOptions DumpOpts) { @@ -146,15 +168,28 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, } else if (std::optional<uint64_t> Val = FormValue.getAsUnsignedConstant()) Name = AttributeValueString(Attr, *Val); - if (!Name.empty()) - WithColor(OS, Color) << Name; - else if (Attr == DW_AT_decl_line || Attr == DW_AT_decl_column || - Attr == DW_AT_call_line || Attr == DW_AT_call_column || - Attr == DW_AT_language_version) { + auto DumpUnsignedConstant = [&OS, + &DumpOpts](const DWARFFormValue &FormValue) { if (std::optional<uint64_t> Val = FormValue.getAsUnsignedConstant()) OS << *Val; else FormValue.dump(OS, DumpOpts); + }; + + llvm::StringRef PrettyVersionName = + prettyLanguageVersionString(AttrValue, Die); + bool ShouldDumpRawLanguageVersion = + Attr == DW_AT_language_version && + (DumpOpts.Verbose || PrettyVersionName.empty()); + + if (!Name.empty()) + WithColor(OS, Color) << Name; + else if (Attr == DW_AT_decl_line || Attr == DW_AT_decl_column || + Attr == DW_AT_call_line || Attr == DW_AT_call_column) { + DumpUnsignedConstant(FormValue); + } else if (Attr == DW_AT_language_version) { + if (ShouldDumpRawLanguageVersion) + DumpUnsignedConstant(FormValue); } else if (Attr == DW_AT_low_pc && (FormValue.getAsAddress() == dwarf::computeTombstoneAddress(U->getAddressByteSize()))) { @@ -226,6 +261,10 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, DumpOpts.RecoverableErrorHandler(createStringError( errc::invalid_argument, "decoding address ranges: %s", toString(RangesOrError.takeError()).c_str())); + } else if (Attr == DW_AT_language_version) { + if (!PrettyVersionName.empty()) + WithColor(OS, Color) << (ShouldDumpRawLanguageVersion ? 
" " : "") + << PrettyVersionName; } OS << ")\n"; diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index f47b7ec..8d413a3 100644 --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -1173,39 +1173,7 @@ void JITDylib::dump(raw_ostream &OS) { << " pending queries: { "; for (const auto &Q : KV.second.pendingQueries()) OS << Q.get() << " (" << Q->getRequiredState() << ") "; - OS << "}\n Defining EDU: "; - if (KV.second.DefiningEDU) { - OS << KV.second.DefiningEDU.get() << " { "; - for (auto &[Name, Flags] : KV.second.DefiningEDU->Symbols) - OS << Name << " "; - OS << "}\n"; - OS << " Dependencies:\n"; - if (!KV.second.DefiningEDU->Dependencies.empty()) { - for (auto &[DepJD, Deps] : KV.second.DefiningEDU->Dependencies) { - OS << " " << DepJD->getName() << ": [ "; - for (auto &Dep : Deps) - OS << Dep << " "; - OS << "]\n"; - } - } else - OS << " none\n"; - } else - OS << "none\n"; - OS << " Dependant EDUs:\n"; - if (!KV.second.DependantEDUs.empty()) { - for (auto &DependantEDU : KV.second.DependantEDUs) { - OS << " " << DependantEDU << ": " - << DependantEDU->JD->getName() << " { "; - for (auto &[Name, Flags] : DependantEDU->Symbols) - OS << Name << " "; - OS << "}\n"; - } - } else - OS << " none\n"; - assert((Symbols[KV.first].getState() != SymbolState::Ready || - (KV.second.pendingQueries().empty() && !KV.second.DefiningEDU && - !KV.second.DependantEDUs.empty())) && - "Stale materializing info entry"); + OS << "}\n"; } }); } @@ -1967,9 +1935,6 @@ bool ExecutionSession::verifySessionState(Twine Phase) { return runSessionLocked([&]() { bool AllOk = true; - // We'll collect these and verify them later to avoid redundant checks. - DenseSet<JITDylib::EmissionDepUnit *> EDUsToCheck; - for (auto &JD : JDs) { auto LogFailure = [&]() -> raw_fd_ostream & { @@ -2063,86 +2028,6 @@ bool ExecutionSession::verifySessionState(Twine Phase) { << " has stale or misordered queries.\n"; } } - - // If there's a DefiningEDU then check that... - // 1. The JD matches. - // 2. The symbol is in the EDU's Symbols map. - // 3. The symbol table entry is in the Emitted state. - if (MII.DefiningEDU) { - - EDUsToCheck.insert(MII.DefiningEDU.get()); - - if (MII.DefiningEDU->JD != JD.get()) { - LogFailure() << "symbol " << Sym - << " has DefiningEDU with incorrect JD" - << (llvm::is_contained(JDs, MII.DefiningEDU->JD) - ? " (JD not currently in ExecutionSession" - : "") - << "\n"; - } - - if (SymItr->second.getState() != SymbolState::Emitted) { - LogFailure() - << "symbol " << Sym - << " has DefiningEDU, but is not in Emitted state.\n"; - } - } - - // Check that JDs for any DependantEDUs are also in the session -- - // that guarantees that we'll also visit them during this loop. - for (auto &DepEDU : MII.DependantEDUs) { - if (!llvm::is_contained(JDs, DepEDU->JD)) { - LogFailure() << "symbol " << Sym << " has DependantEDU " - << (void *)DepEDU << " with JD (" << DepEDU->JD - << ") that isn't in ExecutionSession.\n"; - } - } - } - } - } - - // Check EDUs. 
- for (auto *EDU : EDUsToCheck) { - assert(EDU->JD->State == JITDylib::Open && "EDU->JD is not Open"); - - auto LogFailure = [&]() -> raw_fd_ostream & { - AllOk = false; - auto &Stream = errs(); - Stream << "In EDU defining " << EDU->JD->getName() << ": { "; - for (auto &[Sym, Flags] : EDU->Symbols) - Stream << Sym << " "; - Stream << "}, "; - return Stream; - }; - - if (EDU->Symbols.empty()) - LogFailure() << "no symbols defined.\n"; - else { - for (auto &[Sym, Flags] : EDU->Symbols) { - if (!Sym) - LogFailure() << "null symbol defined.\n"; - else { - if (!EDU->JD->Symbols.count(SymbolStringPtr(Sym))) { - LogFailure() << "symbol " << Sym - << " isn't present in JD's symbol table.\n"; - } - } - } - } - - for (auto &[DepJD, Symbols] : EDU->Dependencies) { - if (!llvm::is_contained(JDs, DepJD)) { - LogFailure() << "dependant symbols listed for JD that isn't in " - "ExecutionSession.\n"; - } else { - for (auto &DepSym : Symbols) { - if (!DepJD->Symbols.count(SymbolStringPtr(DepSym))) { - LogFailure() - << "dependant symbol " << DepSym - << " does not appear in symbol table for dependant JD " - << DepJD->getName() << ".\n"; - } - } } } } @@ -2917,359 +2802,64 @@ Error ExecutionSession::OL_notifyResolved(MaterializationResponsibility &MR, return MR.JD.resolve(MR, Symbols); } -template <typename HandleNewDepFn> -void ExecutionSession::propagateExtraEmitDeps( - std::deque<JITDylib::EmissionDepUnit *> Worklist, EDUInfosMap &EDUInfos, - HandleNewDepFn HandleNewDep) { - - // Iterate to a fixed-point to propagate extra-emit dependencies through the - // EDU graph. - while (!Worklist.empty()) { - auto &EDU = *Worklist.front(); - Worklist.pop_front(); - - assert(EDUInfos.count(&EDU) && "No info entry for EDU"); - auto &EDUInfo = EDUInfos[&EDU]; - - // Propagate new dependencies to users. - for (auto *UserEDU : EDUInfo.IntraEmitUsers) { - - // UserEDUInfo only present if UserEDU has its own users. - JITDylib::EmissionDepUnitInfo *UserEDUInfo = nullptr; - { - auto UserEDUInfoItr = EDUInfos.find(UserEDU); - if (UserEDUInfoItr != EDUInfos.end()) - UserEDUInfo = &UserEDUInfoItr->second; - } - - for (auto &[DepJD, Deps] : EDUInfo.NewDeps) { - auto &UserEDUDepsForJD = UserEDU->Dependencies[DepJD]; - DenseSet<NonOwningSymbolStringPtr> *UserEDUNewDepsForJD = nullptr; - for (auto Dep : Deps) { - if (UserEDUDepsForJD.insert(Dep).second) { - HandleNewDep(*UserEDU, *DepJD, Dep); - if (UserEDUInfo) { - if (!UserEDUNewDepsForJD) { - // If UserEDU has no new deps then it's not in the worklist - // yet, so add it. - if (UserEDUInfo->NewDeps.empty()) - Worklist.push_back(UserEDU); - UserEDUNewDepsForJD = &UserEDUInfo->NewDeps[DepJD]; - } - // Add (DepJD, Dep) to NewDeps. - UserEDUNewDepsForJD->insert(Dep); - } - } +WaitingOnGraph::ExternalState +ExecutionSession::IL_getSymbolState(JITDylib *JD, + NonOwningSymbolStringPtr Name) { + if (JD->State != JITDylib::Open) + return WaitingOnGraph::ExternalState::Failed; + + auto I = JD->Symbols.find_as(Name); + + // FIXME: Can we eliminate this possibility if we support query binding? 
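+  // Map the symbol table entry onto the graph's external view: a missing or
+  // errored symbol reports Failed, a Ready symbol reports Ready, and anything
+  // still materializing reports None.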
+ if (I == JD->Symbols.end()) + return WaitingOnGraph::ExternalState::Failed; + + if (I->second.getFlags().hasError()) + return WaitingOnGraph::ExternalState::Failed; + + if (I->second.getState() == SymbolState::Ready) + return WaitingOnGraph::ExternalState::Ready; + + return WaitingOnGraph::ExternalState::None; +} + +template <typename UpdateSymbolFn, typename UpdateQueryFn> +void ExecutionSession::IL_collectQueries( + JITDylib::AsynchronousSymbolQuerySet &Qs, + WaitingOnGraph::ContainerElementsMap &QualifiedSymbols, + UpdateSymbolFn &&UpdateSymbol, UpdateQueryFn &&UpdateQuery) { + + for (auto &[JD, Symbols] : QualifiedSymbols) { + // IL_emit and JITDylib removal are synchronized by the session lock. + // Since JITDylib removal removes any contained nodes from the + // WaitingOnGraph, we should be able to assert that all nodes in the + // WaitingOnGraph have not been removed. + assert(JD->State == JITDylib::Open && + "WaitingOnGraph includes definition in defunct JITDylib"); + for (auto &Symbol : Symbols) { + // Update symbol table. + auto I = JD->Symbols.find_as(Symbol); + assert(I != JD->Symbols.end() && + "Failed Symbol missing from JD symbol table"); + auto &Entry = I->second; + UpdateSymbol(Entry); + + // Collect queries. + auto J = JD->MaterializingInfos.find_as(Symbol); + if (J != JD->MaterializingInfos.end()) { + for (auto &Q : J->second.takeAllPendingQueries()) { + UpdateQuery(*Q, *JD, Symbol, Entry); + Qs.insert(std::move(Q)); } + JD->MaterializingInfos.erase(J); } } - - EDUInfo.NewDeps.clear(); - } -} - -// Note: This method modifies the emitted set. -ExecutionSession::EDUInfosMap ExecutionSession::simplifyDepGroups( - MaterializationResponsibility &MR, - ArrayRef<SymbolDependenceGroup> EmittedDeps) { - - auto &TargetJD = MR.getTargetJITDylib(); - - // 1. Build initial EmissionDepUnit -> EmissionDepUnitInfo and - // Symbol -> EmissionDepUnit mappings. - DenseMap<JITDylib::EmissionDepUnit *, JITDylib::EmissionDepUnitInfo> EDUInfos; - EDUInfos.reserve(EmittedDeps.size()); - DenseMap<NonOwningSymbolStringPtr, JITDylib::EmissionDepUnit *> EDUForSymbol; - for (auto &DG : EmittedDeps) { - assert(!DG.Symbols.empty() && "DepGroup does not cover any symbols"); - - // Skip empty EDUs. - if (DG.Dependencies.empty()) - continue; - - auto TmpEDU = std::make_shared<JITDylib::EmissionDepUnit>(TargetJD); - auto &EDUInfo = EDUInfos[TmpEDU.get()]; - EDUInfo.EDU = std::move(TmpEDU); - for (const auto &Symbol : DG.Symbols) { - NonOwningSymbolStringPtr NonOwningSymbol(Symbol); - assert(!EDUForSymbol.count(NonOwningSymbol) && - "Symbol should not appear in more than one SymbolDependenceGroup"); - assert(MR.getSymbols().count(Symbol) && - "Symbol in DepGroups not in the emitted set"); - auto NewlyEmittedItr = MR.getSymbols().find(Symbol); - EDUInfo.EDU->Symbols[NonOwningSymbol] = NewlyEmittedItr->second; - EDUForSymbol[NonOwningSymbol] = EDUInfo.EDU.get(); - } - } - - // 2. Build a "residual" EDU to cover all symbols that have no dependencies. 
- { - DenseMap<NonOwningSymbolStringPtr, JITSymbolFlags> ResidualSymbolFlags; - for (auto &[Sym, Flags] : MR.getSymbols()) { - if (!EDUForSymbol.count(NonOwningSymbolStringPtr(Sym))) - ResidualSymbolFlags[NonOwningSymbolStringPtr(Sym)] = Flags; - } - if (!ResidualSymbolFlags.empty()) { - auto ResidualEDU = std::make_shared<JITDylib::EmissionDepUnit>(TargetJD); - ResidualEDU->Symbols = std::move(ResidualSymbolFlags); - auto &ResidualEDUInfo = EDUInfos[ResidualEDU.get()]; - ResidualEDUInfo.EDU = std::move(ResidualEDU); - - // If the residual EDU is the only one then bail out early. - if (EDUInfos.size() == 1) - return EDUInfos; - - // Otherwise add the residual EDU to the EDUForSymbol map. - for (auto &[Sym, Flags] : ResidualEDUInfo.EDU->Symbols) - EDUForSymbol[Sym] = ResidualEDUInfo.EDU.get(); - } - } - -#ifndef NDEBUG - assert(EDUForSymbol.size() == MR.getSymbols().size() && - "MR symbols not fully covered by EDUs?"); - for (auto &[Sym, Flags] : MR.getSymbols()) { - assert(EDUForSymbol.count(NonOwningSymbolStringPtr(Sym)) && - "Sym in MR not covered by EDU"); - } -#endif // NDEBUG - - // 3. Use the DepGroups array to build a graph of dependencies between - // EmissionDepUnits in this finalization. We want to remove these - // intra-finalization uses, propagating dependencies on symbols outside - // this finalization. Add EDUs to the worklist. - for (auto &DG : EmittedDeps) { - - // Skip SymbolDependenceGroups with no dependencies. - if (DG.Dependencies.empty()) - continue; - - assert(EDUForSymbol.count(NonOwningSymbolStringPtr(*DG.Symbols.begin())) && - "No EDU for DG"); - auto &EDU = - *EDUForSymbol.find(NonOwningSymbolStringPtr(*DG.Symbols.begin())) - ->second; - - for (auto &[DepJD, Deps] : DG.Dependencies) { - DenseSet<NonOwningSymbolStringPtr> NewDepsForJD; - - assert(!Deps.empty() && "Dependence set for DepJD is empty"); - - if (DepJD != &TargetJD) { - // DepJD is some other JITDylib.There can't be any intra-finalization - // edges here, so just skip. - for (auto &Dep : Deps) - NewDepsForJD.insert(NonOwningSymbolStringPtr(Dep)); - } else { - // DepJD is the Target JITDylib. Check for intra-finaliztaion edges, - // skipping any and recording the intra-finalization use instead. - for (auto &Dep : Deps) { - NonOwningSymbolStringPtr NonOwningDep(Dep); - auto I = EDUForSymbol.find(NonOwningDep); - if (I == EDUForSymbol.end()) { - if (!MR.getSymbols().count(Dep)) - NewDepsForJD.insert(NonOwningDep); - continue; - } - - if (I->second != &EDU) - EDUInfos[I->second].IntraEmitUsers.insert(&EDU); - } - } - - if (!NewDepsForJD.empty()) - EDU.Dependencies[DepJD] = std::move(NewDepsForJD); - } - } - - // 4. Build the worklist. - std::deque<JITDylib::EmissionDepUnit *> Worklist; - for (auto &[EDU, EDUInfo] : EDUInfos) { - // If this EDU has extra-finalization dependencies and intra-finalization - // users then add it to the worklist. - if (!EDU->Dependencies.empty()) { - auto I = EDUInfos.find(EDU); - if (I != EDUInfos.end()) { - auto &EDUInfo = I->second; - if (!EDUInfo.IntraEmitUsers.empty()) { - EDUInfo.NewDeps = EDU->Dependencies; - Worklist.push_back(EDU); - } - } - } - } - - // 4. Propagate dependencies through the EDU graph. - propagateExtraEmitDeps( - Worklist, EDUInfos, - [](JITDylib::EmissionDepUnit &, JITDylib &, NonOwningSymbolStringPtr) {}); - - return EDUInfos; -} - -void ExecutionSession::IL_makeEDUReady( - std::shared_ptr<JITDylib::EmissionDepUnit> EDU, - JITDylib::AsynchronousSymbolQuerySet &Queries) { - - // The symbols for this EDU are ready. 
- auto &JD = *EDU->JD; - - for (auto &[Sym, Flags] : EDU->Symbols) { - assert(JD.Symbols.count(SymbolStringPtr(Sym)) && - "JD does not have an entry for Sym"); - auto &Entry = JD.Symbols[SymbolStringPtr(Sym)]; - - assert(((Entry.getFlags().hasMaterializationSideEffectsOnly() && - Entry.getState() == SymbolState::Materializing) || - Entry.getState() == SymbolState::Resolved || - Entry.getState() == SymbolState::Emitted) && - "Emitting from state other than Resolved"); - - Entry.setState(SymbolState::Ready); - - auto MII = JD.MaterializingInfos.find(SymbolStringPtr(Sym)); - - // Check for pending queries. - if (MII == JD.MaterializingInfos.end()) - continue; - auto &MI = MII->second; - - for (auto &Q : MI.takeQueriesMeeting(SymbolState::Ready)) { - Q->notifySymbolMetRequiredState(SymbolStringPtr(Sym), Entry.getSymbol()); - if (Q->isComplete()) - Queries.insert(Q); - Q->removeQueryDependence(JD, SymbolStringPtr(Sym)); - } - - JD.MaterializingInfos.erase(MII); - } - - JD.shrinkMaterializationInfoMemory(); -} - -void ExecutionSession::IL_makeEDUEmitted( - std::shared_ptr<JITDylib::EmissionDepUnit> EDU, - JITDylib::AsynchronousSymbolQuerySet &Queries) { - - // The symbols for this EDU are emitted, but not ready. - auto &JD = *EDU->JD; - - for (auto &[Sym, Flags] : EDU->Symbols) { - assert(JD.Symbols.count(SymbolStringPtr(Sym)) && - "JD does not have an entry for Sym"); - auto &Entry = JD.Symbols[SymbolStringPtr(Sym)]; - - assert(((Entry.getFlags().hasMaterializationSideEffectsOnly() && - Entry.getState() == SymbolState::Materializing) || - Entry.getState() == SymbolState::Resolved || - Entry.getState() == SymbolState::Emitted) && - "Emitting from state other than Resolved"); - - if (Entry.getState() == SymbolState::Emitted) { - // This was already emitted, so we can skip the rest of this loop. -#ifndef NDEBUG - for (auto &[Sym, Flags] : EDU->Symbols) { - assert(JD.Symbols.count(SymbolStringPtr(Sym)) && - "JD does not have an entry for Sym"); - auto &Entry = JD.Symbols[SymbolStringPtr(Sym)]; - assert(Entry.getState() == SymbolState::Emitted && - "Symbols for EDU in inconsistent state"); - assert(JD.MaterializingInfos.count(SymbolStringPtr(Sym)) && - "Emitted symbol has no MI"); - auto MI = JD.MaterializingInfos[SymbolStringPtr(Sym)]; - assert(MI.takeQueriesMeeting(SymbolState::Emitted).empty() && - "Already-emitted symbol has waiting-on-emitted queries"); - } -#endif // NDEBUG - break; - } - - Entry.setState(SymbolState::Emitted); - auto &MI = JD.MaterializingInfos[SymbolStringPtr(Sym)]; - MI.DefiningEDU = EDU; - - for (auto &Q : MI.takeQueriesMeeting(SymbolState::Emitted)) { - Q->notifySymbolMetRequiredState(SymbolStringPtr(Sym), Entry.getSymbol()); - if (Q->isComplete()) - Queries.insert(Q); - } } - - for (auto &[DepJD, Deps] : EDU->Dependencies) { - for (auto &Dep : Deps) - DepJD->MaterializingInfos[SymbolStringPtr(Dep)].DependantEDUs.insert( - EDU.get()); - } -} - -/// Removes the given dependence from EDU. If EDU's dependence set becomes -/// empty then this function adds an entry for it to the EDUInfos map. -/// Returns true if a new EDUInfosMap entry is added. 
-bool ExecutionSession::IL_removeEDUDependence(JITDylib::EmissionDepUnit &EDU, - JITDylib &DepJD, - NonOwningSymbolStringPtr DepSym, - EDUInfosMap &EDUInfos) { - assert(EDU.Dependencies.count(&DepJD) && - "JD does not appear in Dependencies of DependantEDU"); - assert(EDU.Dependencies[&DepJD].count(DepSym) && - "Symbol does not appear in Dependencies of DependantEDU"); - auto &JDDeps = EDU.Dependencies[&DepJD]; - JDDeps.erase(DepSym); - if (JDDeps.empty()) { - EDU.Dependencies.erase(&DepJD); - if (EDU.Dependencies.empty()) { - // If the dependencies set has become empty then EDU _may_ be ready - // (we won't know for sure until we've propagated the extra-emit deps). - // Create an EDUInfo for it (if it doesn't have one already) so that - // it'll be visited after propagation. - auto &DepEDUInfo = EDUInfos[&EDU]; - if (!DepEDUInfo.EDU) { - assert(EDU.JD->Symbols.count( - SymbolStringPtr(EDU.Symbols.begin()->first)) && - "Missing symbol entry for first symbol in EDU"); - auto DepEDUFirstMI = EDU.JD->MaterializingInfos.find( - SymbolStringPtr(EDU.Symbols.begin()->first)); - assert(DepEDUFirstMI != EDU.JD->MaterializingInfos.end() && - "Missing MI for first symbol in DependantEDU"); - DepEDUInfo.EDU = DepEDUFirstMI->second.DefiningEDU; - return true; - } - } - } - return false; } -Error ExecutionSession::makeJDClosedError(JITDylib::EmissionDepUnit &EDU, - JITDylib &ClosedJD) { - SymbolNameSet FailedSymbols; - for (auto &[Sym, Flags] : EDU.Symbols) - FailedSymbols.insert(SymbolStringPtr(Sym)); - SymbolDependenceMap BadDeps; - for (auto &Dep : EDU.Dependencies[&ClosedJD]) - BadDeps[&ClosedJD].insert(SymbolStringPtr(Dep)); - return make_error<UnsatisfiedSymbolDependencies>( - ClosedJD.getExecutionSession().getSymbolStringPool(), EDU.JD, - std::move(FailedSymbols), std::move(BadDeps), - ClosedJD.getName() + " is closed"); -} - -Error ExecutionSession::makeUnsatisfiedDepsError(JITDylib::EmissionDepUnit &EDU, - JITDylib &BadJD, - SymbolNameSet BadDeps) { - SymbolNameSet FailedSymbols; - for (auto &[Sym, Flags] : EDU.Symbols) - FailedSymbols.insert(SymbolStringPtr(Sym)); - SymbolDependenceMap BadDepsMap; - BadDepsMap[&BadJD] = std::move(BadDeps); - return make_error<UnsatisfiedSymbolDependencies>( - BadJD.getExecutionSession().getSymbolStringPool(), &BadJD, - std::move(FailedSymbols), std::move(BadDepsMap), - "dependencies removed or in error state"); -} - -Expected<JITDylib::AsynchronousSymbolQuerySet> +Expected<ExecutionSession::EmitQueries> ExecutionSession::IL_emit(MaterializationResponsibility &MR, - EDUInfosMap EDUInfos) { + WaitingOnGraph::SimplifyResult SR) { if (MR.RT->isDefunct()) return make_error<ResourceTrackerDefunct>(MR.RT); @@ -3279,169 +2869,50 @@ ExecutionSession::IL_emit(MaterializationResponsibility &MR, return make_error<StringError>("JITDylib " + TargetJD.getName() + " is defunct", inconvertibleErrorCode()); + #ifdef EXPENSIVE_CHECKS verifySessionState("entering ExecutionSession::IL_emit"); #endif - // Walk all EDUs: - // 1. Verifying that dependencies are available (not removed or in the error - // state. - // 2. Removing any dependencies that are already Ready. - // 3. Lifting any EDUs for Emitted symbols into the EDUInfos map. - // 4. Finding any dependant EDUs and lifting them into the EDUInfos map. 
- std::deque<JITDylib::EmissionDepUnit *> Worklist; - for (auto &[EDU, _] : EDUInfos) - Worklist.push_back(EDU); - - for (auto *EDU : Worklist) { - auto *EDUInfo = &EDUInfos[EDU]; - - SmallVector<JITDylib *> DepJDsToRemove; - for (auto &[DepJD, Deps] : EDU->Dependencies) { - if (DepJD->State != JITDylib::Open) - return makeJDClosedError(*EDU, *DepJD); - - SymbolNameSet BadDeps; - SmallVector<NonOwningSymbolStringPtr> DepsToRemove; - for (auto &Dep : Deps) { - auto DepEntryItr = DepJD->Symbols.find(SymbolStringPtr(Dep)); - - // If this dep has been removed or moved to the error state then add it - // to the bad deps set. We aggregate these bad deps for more - // comprehensive error messages. - if (DepEntryItr == DepJD->Symbols.end() || - DepEntryItr->second.getFlags().hasError()) { - BadDeps.insert(SymbolStringPtr(Dep)); - continue; - } - - // If this dep isn't emitted yet then just add it to the NewDeps set to - // be propagated. - auto &DepEntry = DepEntryItr->second; - if (DepEntry.getState() < SymbolState::Emitted) { - EDUInfo->NewDeps[DepJD].insert(Dep); - continue; - } - - // This dep has been emitted, so add it to the list to be removed from - // EDU. - DepsToRemove.push_back(Dep); - - // If Dep is Ready then there's nothing further to do. - if (DepEntry.getState() == SymbolState::Ready) { - assert(!DepJD->MaterializingInfos.count(SymbolStringPtr(Dep)) && - "Unexpected MaterializationInfo attached to ready symbol"); - continue; - } + auto ER = G.emit(std::move(SR), + [this](JITDylib *JD, NonOwningSymbolStringPtr Name) { + return IL_getSymbolState(JD, Name); + }); - // If we get here then Dep is Emitted. We need to look up its defining - // EDU and add this EDU to the defining EDU's list of users (this means - // creating an EDUInfos entry if the defining EDU doesn't have one - // already). - assert(DepJD->MaterializingInfos.count(SymbolStringPtr(Dep)) && - "Expected MaterializationInfo for emitted dependency"); - auto &DepMI = DepJD->MaterializingInfos[SymbolStringPtr(Dep)]; - assert(DepMI.DefiningEDU && - "Emitted symbol does not have a defining EDU"); - assert(DepMI.DependantEDUs.empty() && - "Already-emitted symbol has dependant EDUs?"); - auto &DepEDUInfo = EDUInfos[DepMI.DefiningEDU.get()]; - if (!DepEDUInfo.EDU) { - // No EDUInfo yet -- build initial entry, and reset the EDUInfo - // pointer, which we will have invalidated. - EDUInfo = &EDUInfos[EDU]; - DepEDUInfo.EDU = DepMI.DefiningEDU; - for (auto &[DepDepJD, DepDeps] : DepEDUInfo.EDU->Dependencies) { - if (DepDepJD == &TargetJD) { - for (auto &DepDep : DepDeps) - if (!MR.getSymbols().count(SymbolStringPtr(DepDep))) - DepEDUInfo.NewDeps[DepDepJD].insert(DepDep); - } else - DepEDUInfo.NewDeps[DepDepJD] = DepDeps; - } - } - DepEDUInfo.IntraEmitUsers.insert(EDU); - } - - // Some dependencies were removed or in an error state -- error out. - if (!BadDeps.empty()) - return makeUnsatisfiedDepsError(*EDU, *DepJD, std::move(BadDeps)); - - // Remove the emitted / ready deps from DepJD. - for (auto &Dep : DepsToRemove) - Deps.erase(Dep); - - // If there are no further deps in DepJD then flag it for removal too. - if (Deps.empty()) - DepJDsToRemove.push_back(DepJD); - } + EmitQueries EQ; - // Remove any JDs whose dependence sets have become empty. - for (auto &DepJD : DepJDsToRemove) { - assert(EDU->Dependencies.count(DepJD) && - "Trying to remove non-existent dep entries"); - EDU->Dependencies.erase(DepJD); - } - - // Now look for users of this EDU. 
-    for (auto &[Sym, Flags] : EDU->Symbols) {
-      assert(TargetJD.Symbols.count(SymbolStringPtr(Sym)) &&
-             "Sym not present in symbol table");
-      assert((TargetJD.Symbols[SymbolStringPtr(Sym)].getState() ==
-                  SymbolState::Resolved ||
-              TargetJD.Symbols[SymbolStringPtr(Sym)]
-                  .getFlags()
-                  .hasMaterializationSideEffectsOnly()) &&
-             "Emitting symbol not in the resolved state");
-      assert(!TargetJD.Symbols[SymbolStringPtr(Sym)].getFlags().hasError() &&
-             "Symbol is already in an error state");
-
-      auto MII = TargetJD.MaterializingInfos.find(SymbolStringPtr(Sym));
-      if (MII == TargetJD.MaterializingInfos.end() ||
-          MII->second.DependantEDUs.empty())
-        continue;
-
-      for (auto &DependantEDU : MII->second.DependantEDUs) {
-        if (IL_removeEDUDependence(*DependantEDU, TargetJD, Sym, EDUInfos))
-          EDUInfo = &EDUInfos[EDU];
-        EDUInfo->IntraEmitUsers.insert(DependantEDU);
-      }
-      MII->second.DependantEDUs.clear();
-    }
-  }
-
-  Worklist.clear();
-  for (auto &[EDU, EDUInfo] : EDUInfos) {
-    if (!EDUInfo.IntraEmitUsers.empty() && !EDU->Dependencies.empty()) {
-      if (EDUInfo.NewDeps.empty())
-        EDUInfo.NewDeps = EDU->Dependencies;
-      Worklist.push_back(EDU);
-    }
-  }
-
-  propagateExtraEmitDeps(
-      Worklist, EDUInfos,
-      [](JITDylib::EmissionDepUnit &EDU, JITDylib &JD,
-         NonOwningSymbolStringPtr Sym) {
-        JD.MaterializingInfos[SymbolStringPtr(Sym)].DependantEDUs.insert(&EDU);
-      });
+  // Handle failed queries.
+  for (auto &SN : ER.Failed)
+    IL_collectQueries(
+        EQ.Failed, SN->defs(),
+        [](JITDylib::SymbolTableEntry &E) {
+          E.setFlags(E.getFlags() | JITSymbolFlags::HasError);
+        },
+        [&](AsynchronousSymbolQuery &Q, JITDylib &JD,
+            NonOwningSymbolStringPtr Name, JITDylib::SymbolTableEntry &E) {
+          auto &FS = EQ.FailedSymsForQuery[&Q];
+          if (!FS)
+            FS = std::make_shared<SymbolDependenceMap>();
+          (*FS)[&JD].insert(SymbolStringPtr(Name));
+        });
-  JITDylib::AsynchronousSymbolQuerySet CompletedQueries;
+  for (auto &FQ : EQ.Failed)
+    FQ->detach();
-  // Extract completed queries and lodge not-yet-ready EDUs in the
-  // session.
- for (auto &[EDU, EDUInfo] : EDUInfos) { - if (EDU->Dependencies.empty()) - IL_makeEDUReady(std::move(EDUInfo.EDU), CompletedQueries); - else - IL_makeEDUEmitted(std::move(EDUInfo.EDU), CompletedQueries); - } + for (auto &SN : ER.Ready) + IL_collectQueries( + EQ.Updated, SN->defs(), + [](JITDylib::SymbolTableEntry &E) { E.setState(SymbolState::Ready); }, + [](AsynchronousSymbolQuery &Q, JITDylib &JD, + NonOwningSymbolStringPtr Name, JITDylib::SymbolTableEntry &E) { + Q.notifySymbolMetRequiredState(SymbolStringPtr(Name), E.getSymbol()); + }); #ifdef EXPENSIVE_CHECKS verifySessionState("exiting ExecutionSession::IL_emit"); #endif - return std::move(CompletedQueries); + return std::move(EQ); } Error ExecutionSession::OL_notifyEmitted( @@ -3471,40 +2942,127 @@ Error ExecutionSession::OL_notifyEmitted( } #endif // NDEBUG - auto EDUInfos = simplifyDepGroups(MR, DepGroups); + std::vector<std::unique_ptr<WaitingOnGraph::SuperNode>> SNs; + WaitingOnGraph::ContainerElementsMap Residual; + { + auto &JDResidual = Residual[&MR.getTargetJITDylib()]; + for (auto &[Name, Flags] : MR.getSymbols()) + JDResidual.insert(NonOwningSymbolStringPtr(Name)); + + for (auto &SDG : DepGroups) { + WaitingOnGraph::ContainerElementsMap Defs; + assert(!SDG.Symbols.empty()); + auto &JDDefs = Defs[&MR.getTargetJITDylib()]; + for (auto &Def : SDG.Symbols) { + JDDefs.insert(NonOwningSymbolStringPtr(Def)); + JDResidual.erase(NonOwningSymbolStringPtr(Def)); + } + WaitingOnGraph::ContainerElementsMap Deps; + if (!SDG.Dependencies.empty()) { + for (auto &[JD, Syms] : SDG.Dependencies) { + auto &JDDeps = Deps[JD]; + for (auto &Dep : Syms) + JDDeps.insert(NonOwningSymbolStringPtr(Dep)); + } + } + SNs.push_back(std::make_unique<WaitingOnGraph::SuperNode>( + std::move(Defs), std::move(Deps))); + } + if (!JDResidual.empty()) + SNs.push_back(std::make_unique<WaitingOnGraph::SuperNode>( + std::move(Residual), WaitingOnGraph::ContainerElementsMap())); + } + + auto SR = WaitingOnGraph::simplify(std::move(SNs)); LLVM_DEBUG({ dbgs() << " Simplified dependencies:\n"; - for (auto &[EDU, EDUInfo] : EDUInfos) { - dbgs() << " Symbols: { "; - for (auto &[Sym, Flags] : EDU->Symbols) - dbgs() << Sym << " "; - dbgs() << "}, Dependencies: { "; - for (auto &[DepJD, Deps] : EDU->Dependencies) { - dbgs() << "(" << DepJD->getName() << ", { "; - for (auto &Dep : Deps) - dbgs() << Dep << " "; - dbgs() << "}) "; + for (auto &SN : SR.superNodes()) { + + auto SortedLibs = [](WaitingOnGraph::ContainerElementsMap &C) { + std::vector<JITDylib *> JDs; + for (auto &[JD, _] : C) + JDs.push_back(JD); + llvm::sort(JDs, [](const JITDylib *LHS, const JITDylib *RHS) { + return LHS->getName() < RHS->getName(); + }); + return JDs; + }; + + auto SortedNames = [](WaitingOnGraph::ElementSet &Elems) { + std::vector<NonOwningSymbolStringPtr> Names(Elems.begin(), Elems.end()); + llvm::sort(Names, [](const NonOwningSymbolStringPtr &LHS, + const NonOwningSymbolStringPtr &RHS) { + return *LHS < *RHS; + }); + return Names; + }; + + dbgs() << " Defs: {"; + for (auto *JD : SortedLibs(SN->defs())) { + dbgs() << " (" << JD->getName() << ", ["; + for (auto &Sym : SortedNames(SN->defs()[JD])) + dbgs() << " " << Sym; + dbgs() << " ])"; + } + dbgs() << " }, Deps: {"; + for (auto *JD : SortedLibs(SN->deps())) { + dbgs() << " (" << JD->getName() << ", ["; + for (auto &Sym : SortedNames(SN->deps()[JD])) + dbgs() << " " << Sym; + dbgs() << " ])"; } - dbgs() << "}\n"; + dbgs() << " }\n"; } }); - - auto CompletedQueries = - runSessionLocked([&]() { return IL_emit(MR, EDUInfos); }); + auto 
EmitQueries = + runSessionLocked([&]() { return IL_emit(MR, std::move(SR)); }); // On error bail out. - if (!CompletedQueries) - return CompletedQueries.takeError(); + if (!EmitQueries) + return EmitQueries.takeError(); - MR.SymbolFlags.clear(); + // Otherwise notify failed queries, and any updated queries that have been + // completed. - // Otherwise notify all the completed queries. - for (auto &Q : *CompletedQueries) { - assert(Q->isComplete() && "Q is not complete"); - Q->handleComplete(*this); + // FIXME: Get rid of error return from notifyEmitted. + SymbolDependenceMap BadDeps; + { + for (auto &FQ : EmitQueries->Failed) { + FQ->detach(); + assert(EmitQueries->FailedSymsForQuery.count(FQ.get()) && + "Missing failed symbols for query"); + auto FailedSyms = std::move(EmitQueries->FailedSymsForQuery[FQ.get()]); + for (auto &[JD, Syms] : *FailedSyms) { + auto &BadDepsForJD = BadDeps[JD]; + for (auto &Sym : Syms) + BadDepsForJD.insert(Sym); + } + FQ->handleFailed(make_error<FailedToMaterialize>(getSymbolStringPool(), + std::move(FailedSyms))); + } + } + + for (auto &UQ : EmitQueries->Updated) + if (UQ->isComplete()) + UQ->handleComplete(*this); + + // If there are any bad dependencies then return an error. + if (!BadDeps.empty()) { + SymbolNameSet BadNames; + // Note: The name set calculated here is bogus: it includes all symbols in + // the MR, not just the ones that failed. We want to remove the error + // return path from notifyEmitted anyway, so this is just a brief + // placeholder to maintain (roughly) the current error behavior. + for (auto &[Name, Flags] : MR.getSymbols()) + BadNames.insert(Name); + MR.SymbolFlags.clear(); + return make_error<UnsatisfiedSymbolDependencies>( + getSymbolStringPool(), &MR.getTargetJITDylib(), std::move(BadNames), + std::move(BadDeps), "dependencies removed or in error state"); } + MR.SymbolFlags.clear(); return Error::success(); } @@ -3535,158 +3093,48 @@ ExecutionSession::IL_failSymbols(JITDylib &JD, #endif JITDylib::AsynchronousSymbolQuerySet FailedQueries; - auto FailedSymbolsMap = std::make_shared<SymbolDependenceMap>(); - auto ExtractFailedQueries = [&](JITDylib::MaterializingInfo &MI) { - JITDylib::AsynchronousSymbolQueryList ToDetach; - for (auto &Q : MI.pendingQueries()) { - // Add the query to the list to be failed and detach it. - FailedQueries.insert(Q); - ToDetach.push_back(Q); + auto Fail = [&](JITDylib *FailJD, NonOwningSymbolStringPtr FailSym) { + auto I = FailJD->Symbols.find_as(FailSym); + assert(I != FailJD->Symbols.end()); + I->second.setFlags(I->second.getFlags() | JITSymbolFlags::HasError); + auto J = FailJD->MaterializingInfos.find_as(FailSym); + if (J != FailJD->MaterializingInfos.end()) { + for (auto &Q : J->second.takeAllPendingQueries()) + FailedQueries.insert(std::move(Q)); + FailJD->MaterializingInfos.erase(J); } - for (auto &Q : ToDetach) - Q->detach(); - assert(!MI.hasQueriesPending() && "Queries still pending after detach"); }; - for (auto &Name : SymbolsToFail) { - (*FailedSymbolsMap)[&JD].insert(Name); - - // Look up the symbol to fail. - auto SymI = JD.Symbols.find(Name); - - // FIXME: Revisit this. We should be able to assert sequencing between - // ResourceTracker removal and symbol failure. - // - // It's possible that this symbol has already been removed, e.g. if a - // materialization failure happens concurrently with a ResourceTracker or - // JITDylib removal. In that case we can safely skip this symbol and - // continue. 
- if (SymI == JD.Symbols.end()) - continue; - auto &Sym = SymI->second; - - // If the symbol is already in the error state then we must have visited - // it earlier. - if (Sym.getFlags().hasError()) { - assert(!JD.MaterializingInfos.count(Name) && - "Symbol in error state still has MaterializingInfo"); - continue; - } + auto FailedSymbolsMap = std::make_shared<SymbolDependenceMap>(); - // Move the symbol into the error state. - Sym.setFlags(Sym.getFlags() | JITSymbolFlags::HasError); - - // FIXME: Come up with a sane mapping of state to - // presence-of-MaterializingInfo so that we can assert presence / absence - // here, rather than testing it. - auto MII = JD.MaterializingInfos.find(Name); - if (MII == JD.MaterializingInfos.end()) - continue; - - auto &MI = MII->second; - - // Collect queries to be failed for this MII. - ExtractFailedQueries(MI); - - if (MI.DefiningEDU) { - // If there is a DefiningEDU for this symbol then remove this - // symbol from it. - assert(MI.DependantEDUs.empty() && - "Symbol with DefiningEDU should not have DependantEDUs"); - assert(Sym.getState() >= SymbolState::Emitted && - "Symbol has EDU, should have been emitted"); - assert(MI.DefiningEDU->Symbols.count(NonOwningSymbolStringPtr(Name)) && - "Symbol does not appear in its DefiningEDU"); - MI.DefiningEDU->Symbols.erase(NonOwningSymbolStringPtr(Name)); - - // Remove this EDU from the dependants lists of its dependencies. - for (auto &[DepJD, DepSyms] : MI.DefiningEDU->Dependencies) { - for (auto DepSym : DepSyms) { - assert(DepJD->Symbols.count(SymbolStringPtr(DepSym)) && - "DepSym not in DepJD"); - assert(DepJD->MaterializingInfos.count(SymbolStringPtr(DepSym)) && - "DepSym has not MaterializingInfo"); - auto &SymMI = DepJD->MaterializingInfos[SymbolStringPtr(DepSym)]; - assert(SymMI.DependantEDUs.count(MI.DefiningEDU.get()) && - "DefiningEDU missing from DependantEDUs list of dependency"); - SymMI.DependantEDUs.erase(MI.DefiningEDU.get()); - } - } + { + auto &FailedSymsForJD = (*FailedSymbolsMap)[&JD]; + for (auto &Sym : SymbolsToFail) { + FailedSymsForJD.insert(Sym); + Fail(&JD, NonOwningSymbolStringPtr(Sym)); + } + } - MI.DefiningEDU = nullptr; - } else { - // Otherwise if there are any EDUs waiting on this symbol then move - // those symbols to the error state too, and deregister them from the - // symbols that they depend on. - // Note: We use a copy of DependantEDUs here since we'll be removing - // from the original set as we go. - for (auto &DependantEDU : MI.DependantEDUs) { - - // Remove DependantEDU from all of its users DependantEDUs lists. - for (auto &[DepJD, DepSyms] : DependantEDU->Dependencies) { - for (auto DepSym : DepSyms) { - // Skip self-reference to avoid invalidating the MI.DependantEDUs - // map. We'll clear this later. - if (DepJD == &JD && DepSym == Name) - continue; - assert(DepJD->Symbols.count(SymbolStringPtr(DepSym)) && - "DepSym not in DepJD?"); - assert(DepJD->MaterializingInfos.count(SymbolStringPtr(DepSym)) && - "DependantEDU not registered with symbol it depends on"); - auto &SymMI = DepJD->MaterializingInfos[SymbolStringPtr(DepSym)]; - assert(SymMI.DependantEDUs.count(DependantEDU) && - "DependantEDU missing from DependantEDUs list"); - SymMI.DependantEDUs.erase(DependantEDU); - } - } + WaitingOnGraph::ContainerElementsMap ToFail; + auto &JDToFail = ToFail[&JD]; + for (auto &Sym : SymbolsToFail) + JDToFail.insert(NonOwningSymbolStringPtr(Sym)); - // Move any symbols defined by DependantEDU into the error state and - // fail any queries waiting on them. 
- auto &DepJD = *DependantEDU->JD; - auto DepEDUSymbols = std::move(DependantEDU->Symbols); - for (auto &[DepName, Flags] : DepEDUSymbols) { - auto DepSymItr = DepJD.Symbols.find(SymbolStringPtr(DepName)); - assert(DepSymItr != DepJD.Symbols.end() && - "Symbol not present in table"); - auto &DepSym = DepSymItr->second; - - assert(DepSym.getState() >= SymbolState::Emitted && - "Symbol has EDU, should have been emitted"); - assert(!DepSym.getFlags().hasError() && - "Symbol is already in the error state?"); - DepSym.setFlags(DepSym.getFlags() | JITSymbolFlags::HasError); - (*FailedSymbolsMap)[&DepJD].insert(SymbolStringPtr(DepName)); - - // This symbol has a defining EDU so its MaterializingInfo object must - // exist. - auto DepMIItr = - DepJD.MaterializingInfos.find(SymbolStringPtr(DepName)); - assert(DepMIItr != DepJD.MaterializingInfos.end() && - "Symbol has defining EDU but not MaterializingInfo"); - auto &DepMI = DepMIItr->second; - assert(DepMI.DefiningEDU.get() == DependantEDU && - "Bad EDU dependence edge"); - assert(DepMI.DependantEDUs.empty() && - "Symbol was emitted, should not have any DependantEDUs"); - ExtractFailedQueries(DepMI); - DepJD.MaterializingInfos.erase(SymbolStringPtr(DepName)); - } + auto FailedSNs = G.fail(ToFail); - DepJD.shrinkMaterializationInfoMemory(); + for (auto &SN : FailedSNs) { + for (auto &[FailJD, Defs] : SN->defs()) { + auto &FailedSymsForFailJD = (*FailedSymbolsMap)[FailJD]; + for (auto &Def : Defs) { + FailedSymsForFailJD.insert(SymbolStringPtr(Def)); + Fail(FailJD, Def); } - - MI.DependantEDUs.clear(); } - - assert(!MI.DefiningEDU && "DefiningEDU should have been reset"); - assert(MI.DependantEDUs.empty() && - "DependantEDUs should have been removed above"); - assert(!MI.hasQueriesPending() && - "Can not delete MaterializingInfo with queries pending"); - JD.MaterializingInfos.erase(Name); } - JD.shrinkMaterializationInfoMemory(); + // Detach all failed queries. + for (auto &Q : FailedQueries) + Q->detach(); #ifdef EXPENSIVE_CHECKS verifySessionState("exiting ExecutionSession::IL_failSymbols"); @@ -3721,9 +3169,11 @@ void ExecutionSession::OL_notifyFailed(MaterializationResponsibility &MR) { return IL_failSymbols(MR.getTargetJITDylib(), SymbolsToFail); }); - for (auto &Q : FailedQueries) + for (auto &Q : FailedQueries) { + Q->detach(); Q->handleFailed( make_error<FailedToMaterialize>(getSymbolStringPool(), FailedSymbols)); + } } Error ExecutionSession::OL_replace(MaterializationResponsibility &MR, diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp index dec1df7..893523c 100644 --- a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp +++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp @@ -448,7 +448,7 @@ Error SimpleRemoteEPC::handleHangup(SimpleRemoteEPCArgBytesVector ArgBytes) { if (const char *ErrMsg = WFR.getOutOfBandError()) return make_error<StringError>(ErrMsg, inconvertibleErrorCode()); - detail::SPSSerializableError Info; + orc::shared::detail::SPSSerializableError Info; SPSInputBuffer IB(WFR.data(), WFR.size()); if (!SPSArgList<SPSError>::deserialize(IB, Info)) return make_error<StringError>("Could not deserialize hangup info", diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 3908a78..488b078 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -3196,7 +3196,7 @@ void AssemblyWriter::printModuleSummaryIndex() { // for aliasee (then update BitcodeWriter.cpp and remove get/setAliaseeGUID). 
   for (auto &GlobalList : *TheIndex) {
     auto GUID = GlobalList.first;
-    for (auto &Summary : GlobalList.second.SummaryList)
+    for (auto &Summary : GlobalList.second.getSummaryList())
       SummaryToGUIDMap[Summary.get()] = GUID;
   }
 
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 10f915d..7e5e7b5 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -6045,6 +6045,120 @@ void llvm::UpgradeFunctionAttributes(Function &F) {
   }
 }
 
+// Set the given function attribute, unless it is already present.
+static void setFunctionAttrIfNotSet(Function &F, StringRef FnAttrName,
+                                    StringRef Value) {
+  if (!F.hasFnAttribute(FnAttrName))
+    F.addFnAttr(FnAttrName, Value);
+}
+
+// If the attribute is absent, add it (valueless) when Set is true.
+// If the attribute's value is "false", remove it.
+// If its value is "true", reset it to a valueless attribute.
+static void ConvertFunctionAttr(Function &F, bool Set, StringRef FnAttrName) {
+  if (!F.hasFnAttribute(FnAttrName)) {
+    if (Set)
+      F.addFnAttr(FnAttrName);
+  } else {
+    auto A = F.getFnAttribute(FnAttrName);
+    if ("false" == A.getValueAsString())
+      F.removeFnAttr(FnAttrName);
+    else if ("true" == A.getValueAsString()) {
+      F.removeFnAttr(FnAttrName);
+      F.addFnAttr(FnAttrName);
+    }
+  }
+}
+
+void llvm::copyModuleAttrToFunctions(Module &M) {
+  Triple T(M.getTargetTriple());
+  if (!T.isThumb() && !T.isARM() && !T.isAArch64())
+    return;
+
+  uint64_t BTEValue = 0;
+  uint64_t BPPLRValue = 0;
+  uint64_t GCSValue = 0;
+  uint64_t SRAValue = 0;
+  uint64_t SRAALLValue = 0;
+  uint64_t SRABKeyValue = 0;
+
+  NamedMDNode *ModFlags = M.getModuleFlagsMetadata();
+  if (ModFlags) {
+    for (unsigned I = 0, E = ModFlags->getNumOperands(); I != E; ++I) {
+      MDNode *Op = ModFlags->getOperand(I);
+      if (Op->getNumOperands() != 3)
+        continue;
+
+      MDString *ID = dyn_cast_or_null<MDString>(Op->getOperand(1));
+      auto *CI = mdconst::dyn_extract<ConstantInt>(Op->getOperand(2));
+      if (!ID || !CI)
+        continue;
+
+      StringRef IDStr = ID->getString();
+      uint64_t *ValPtr = IDStr == "branch-target-enforcement" ? &BTEValue
+                         : IDStr == "branch-protection-pauth-lr" ? &BPPLRValue
+                         : IDStr == "guarded-control-stack" ? &GCSValue
+                         : IDStr == "sign-return-address" ? &SRAValue
+                         : IDStr == "sign-return-address-all" ? &SRAALLValue
+                         : IDStr == "sign-return-address-with-bkey"
+                             ? 
&SRABKeyValue + : nullptr; + if (!ValPtr) + continue; + + *ValPtr = CI->getZExtValue(); + if (*ValPtr == 2) + return; + } + } + + bool BTE = BTEValue == 1; + bool BPPLR = BPPLRValue == 1; + bool GCS = GCSValue == 1; + bool SRA = SRAValue == 1; + + StringRef SignTypeValue = "non-leaf"; + if (SRA && SRAALLValue == 1) + SignTypeValue = "all"; + + StringRef SignKeyValue = "a_key"; + if (SRA && SRABKeyValue == 1) + SignKeyValue = "b_key"; + + for (Function &F : M.getFunctionList()) { + if (F.isDeclaration()) + continue; + + if (SRA) { + setFunctionAttrIfNotSet(F, "sign-return-address", SignTypeValue); + setFunctionAttrIfNotSet(F, "sign-return-address-key", SignKeyValue); + } else { + if (auto A = F.getFnAttribute("sign-return-address"); + A.isValid() && "none" == A.getValueAsString()) { + F.removeFnAttr("sign-return-address"); + F.removeFnAttr("sign-return-address-key"); + } + } + ConvertFunctionAttr(F, BTE, "branch-target-enforcement"); + ConvertFunctionAttr(F, BPPLR, "branch-protection-pauth-lr"); + ConvertFunctionAttr(F, GCS, "guarded-control-stack"); + } + + if (BTE) + M.setModuleFlag(llvm::Module::Min, "branch-target-enforcement", 2); + if (BPPLR) + M.setModuleFlag(llvm::Module::Min, "branch-protection-pauth-lr", 2); + if (GCS) + M.setModuleFlag(llvm::Module::Min, "guarded-control-stack", 2); + if (SRA) { + M.setModuleFlag(llvm::Module::Min, "sign-return-address", 2); + if (SRAALLValue == 1) + M.setModuleFlag(llvm::Module::Min, "sign-return-address-all", 2); + if (SRABKeyValue == 1) + M.setModuleFlag(llvm::Module::Min, "sign-return-address-with-bkey", 2); + } +} + static bool isOldLoopArgument(Metadata *MD) { auto *T = dyn_cast_or_null<MDTuple>(MD); if (!T) diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 9060a89..3b8fde8 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -2878,7 +2878,7 @@ unsigned CastInst::isEliminableCastPair(Instruction::CastOps firstOp, { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // FPTrunc | { 99,99,99, 2, 2,99,99, 8, 2,99,99,99, 4, 0}, // FPExt | { 1, 0, 0,99,99, 0, 0,99,99,99,99, 7, 3, 0}, // PtrToInt | - { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr | + { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr | { 99,99,99,99,99,99,99,99,99,11,11,99,15, 0}, // IntToPtr | { 5, 5, 5, 0, 0, 5, 5, 0, 0,16,16, 5, 1,14}, // BitCast | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+ diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp index dc55b63..a6353664 100644 --- a/llvm/lib/IR/ModuleSummaryIndex.cpp +++ b/llvm/lib/IR/ModuleSummaryIndex.cpp @@ -162,7 +162,7 @@ void ModuleSummaryIndex::collectDefinedFunctionsForModule( StringRef ModulePath, GVSummaryMapTy &GVSummaryMap) const { for (auto &GlobalList : *this) { auto GUID = GlobalList.first; - for (auto &GlobSummary : GlobalList.second.SummaryList) { + for (auto &GlobSummary : GlobalList.second.getSummaryList()) { auto *Summary = dyn_cast_or_null<FunctionSummary>(GlobSummary.get()); if (!Summary) // Ignore global variable, focus on functions @@ -263,7 +263,7 @@ void ModuleSummaryIndex::propagateAttributes( DenseSet<ValueInfo> MarkedNonReadWriteOnly; for (auto &P : *this) { bool IsDSOLocal = true; - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { if (!isGlobalValueLive(S.get())) { // computeDeadSymbolsAndUpdateIndirectCalls should have marked all // copies live. 
Note that it is possible that there is a GUID collision @@ -273,7 +273,7 @@ void ModuleSummaryIndex::propagateAttributes( // all copies live we can assert here that all are dead if any copy is // dead. assert(llvm::none_of( - P.second.SummaryList, + P.second.getSummaryList(), [&](const std::unique_ptr<GlobalValueSummary> &Summary) { return isGlobalValueLive(Summary.get()); })); @@ -308,16 +308,16 @@ void ModuleSummaryIndex::propagateAttributes( // Mark the flag in all summaries false so that we can do quick check // without going through the whole list. for (const std::unique_ptr<GlobalValueSummary> &Summary : - P.second.SummaryList) + P.second.getSummaryList()) Summary->setDSOLocal(false); } setWithAttributePropagation(); setWithDSOLocalPropagation(); if (llvm::AreStatisticsEnabled()) for (auto &P : *this) - if (P.second.SummaryList.size()) + if (P.second.getSummaryList().size()) if (auto *GVS = dyn_cast<GlobalVarSummary>( - P.second.SummaryList[0]->getBaseObject())) + P.second.getSummaryList()[0]->getBaseObject())) if (isGlobalValueLive(GVS)) { if (GVS->maybeReadOnly()) ReadOnlyLiveGVars++; diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 7ea2e46..77af29b 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -21,9 +21,6 @@ using namespace RTLIB; #define GET_SET_TARGET_RUNTIME_LIBCALL_SETS #define DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME #include "llvm/IR/RuntimeLibcalls.inc" -#undef GET_INIT_RUNTIME_LIBCALL_NAMES -#undef GET_SET_TARGET_RUNTIME_LIBCALL_SETS -#undef DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME /// Set default libcall names. If a target wants to opt-out of a libcall it /// should be placed here. diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index aec8891..cbc0b1d 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -457,7 +457,7 @@ void llvm::thinLTOResolvePrevailingInIndex( // when needed. DenseSet<GlobalValueSummary *> GlobalInvolvedWithAlias; for (auto &I : Index) - for (auto &S : I.second.SummaryList) + for (auto &S : I.second.getSummaryList()) if (auto AS = dyn_cast<AliasSummary>(S.get())) GlobalInvolvedWithAlias.insert(&AS->getAliasee()); @@ -1182,7 +1182,7 @@ Error LTO::checkPartiallySplit() { // Otherwise check if there are any recorded in the combined summary from the // ThinLTO modules. for (auto &P : ThinLTO.CombinedIndex) { - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { auto *FS = dyn_cast<FunctionSummary>(S.get()); if (!FS) continue; diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 280c3d1..93118be 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -770,11 +770,11 @@ bool lto::initImportList(const Module &M, // via a WriteIndexesThinBackend. for (const auto &GlobalList : CombinedIndex) { // Ignore entries for undefined references. - if (GlobalList.second.SummaryList.empty()) + if (GlobalList.second.getSummaryList().empty()) continue; auto GUID = GlobalList.first; - for (const auto &Summary : GlobalList.second.SummaryList) { + for (const auto &Summary : GlobalList.second.getSummaryList()) { // Skip the summaries for the importing module. These are included to // e.g. record required linkage changes. 
if (Summary->modulePath() == M.getModuleIdentifier()) diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index 5b333cd..ff94c54 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -101,8 +101,8 @@ static void saveTempBitcode(const Module &TheModule, StringRef TempDir, WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true); } -static const GlobalValueSummary * -getFirstDefinitionForLinker(const GlobalValueSummaryList &GVSummaryList) { +static const GlobalValueSummary *getFirstDefinitionForLinker( + ArrayRef<std::unique_ptr<GlobalValueSummary>> GVSummaryList) { // If there is any strong definition anywhere, get it. auto StrongDefForLinker = llvm::find_if( GVSummaryList, [](const std::unique_ptr<GlobalValueSummary> &Summary) { @@ -131,14 +131,15 @@ getFirstDefinitionForLinker(const GlobalValueSummaryList &GVSummaryList) { static void computePrevailingCopies( const ModuleSummaryIndex &Index, DenseMap<GlobalValue::GUID, const GlobalValueSummary *> &PrevailingCopy) { - auto HasMultipleCopies = [&](const GlobalValueSummaryList &GVSummaryList) { - return GVSummaryList.size() > 1; - }; + auto HasMultipleCopies = + [&](ArrayRef<std::unique_ptr<GlobalValueSummary>> GVSummaryList) { + return GVSummaryList.size() > 1; + }; for (auto &I : Index) { - if (HasMultipleCopies(I.second.SummaryList)) + if (HasMultipleCopies(I.second.getSummaryList())) PrevailingCopy[I.first] = - getFirstDefinitionForLinker(I.second.SummaryList); + getFirstDefinitionForLinker(I.second.getSummaryList()); } } diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index 1bff6cd..f78d9b0 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -1512,6 +1512,11 @@ Error IRLinker::run() { // Loop over all of the linked values to compute type mappings. computeTypeMapping(); + // Convert module level attributes to function level attributes because + // after merging modules the attributes might change and would have different + // effect on the functions as the original module would have. + copyModuleAttrToFunctions(*SrcM); + std::reverse(Worklist.begin(), Worklist.end()); while (!Worklist.empty()) { GlobalValue *GV = Worklist.back(); @@ -1677,6 +1682,11 @@ IRMover::IRMover(Module &M) : Composite(M) { for (const auto *MD : StructTypes.getVisitedMetadata()) { SharedMDs[MD].reset(const_cast<MDNode *>(MD)); } + + // Convert module level attributes to function level attributes because + // after merging modules the attributes might change and would have different + // effect on the functions as the original module would have. + copyModuleAttrToFunctions(M); } Error IRMover::move(std::unique_ptr<Module> Src, diff --git a/llvm/lib/Support/AllocToken.cpp b/llvm/lib/Support/AllocToken.cpp new file mode 100644 index 0000000..95ecda2 --- /dev/null +++ b/llvm/lib/Support/AllocToken.cpp @@ -0,0 +1,50 @@ +//===- AllocToken.cpp - Allocation Token Calculation ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definition of AllocToken modes and shared calculation of stateless token IDs. 
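+// Stateless token IDs are derived from a stable SipHash of the type name,
+// reduced modulo the maximum token count; stateful modes (Increment, Random)
+// have no pure-function equivalent and yield no token here.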
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/AllocToken.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/SipHash.h" + +using namespace llvm; + +static uint64_t getStableHash(const AllocTokenMetadata &Metadata, + uint64_t MaxTokens) { + return getStableSipHash(Metadata.TypeName) % MaxTokens; +} + +std::optional<uint64_t> llvm::getAllocToken(AllocTokenMode Mode, + const AllocTokenMetadata &Metadata, + uint64_t MaxTokens) { + assert(MaxTokens && "Must provide non-zero max tokens"); + + switch (Mode) { + case AllocTokenMode::Increment: + case AllocTokenMode::Random: + // Stateful modes cannot be implemented as a pure function. + return std::nullopt; + + case AllocTokenMode::TypeHash: + return getStableHash(Metadata, MaxTokens); + + case AllocTokenMode::TypeHashPointerSplit: { + if (MaxTokens == 1) + return 0; + const uint64_t HalfTokens = MaxTokens / 2; + uint64_t Hash = getStableHash(Metadata, HalfTokens); + if (Metadata.ContainsPointer) + Hash += HalfTokens; + return Hash; + } + } + + llvm_unreachable(""); +} diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 42b21b5..671a5fe 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -149,6 +149,7 @@ add_llvm_component_library(LLVMSupport AArch64BuildAttributes.cpp ARMAttributeParser.cpp ARMWinEH.cpp + AllocToken.cpp Allocator.cpp AutoConvert.cpp Base64.cpp diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index 549c418..f74e52a 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -111,7 +111,7 @@ Error SpecialCaseList::Matcher::insert(StringRef Pattern, unsigned LineNumber) { return std::visit([&](auto &V) { return V.insert(Pattern, LineNumber); }, M); } -LLVM_ABI void SpecialCaseList::Matcher::preprocess(bool BySize) { +void SpecialCaseList::Matcher::preprocess(bool BySize) { return std::visit([&](auto &V) { return V.preprocess(BySize); }, M); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 662d84b..a81de5c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -27602,6 +27602,15 @@ static SDValue performPTestFirstCombine(SDNode *N, static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { + SDLoc DL(N); + + // If a DUP(Op0) already exists, reuse it for the scalar_to_vector. + if (DCI.isAfterLegalizeDAG()) { + if (SDNode *LN = DCI.DAG.getNodeIfExists(AArch64ISD::DUP, N->getVTList(), + N->getOperand(0))) + return SDValue(LN, 0); + } + // Let's do below transform. // // t34: v4i32 = AArch64ISD::UADDLV t2 @@ -27638,7 +27647,6 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return SDValue(); // Let's generate new sequence with AArch64ISD::NVCAST. - SDLoc DL(N); SDValue EXTRACT_SUBVEC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV, DAG.getConstant(0, DL, MVT::i64)); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 479e345..e3370d3 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5722,7 +5722,7 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( } // Add additional cost for the extends that would need to be inserted. 
-  return Cost + 4;
+  return Cost + 2;
 }
 
 InstructionCost
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
new file mode 100644
index 0000000..30a1f05
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
@@ -0,0 +1,73 @@
+//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a DAG scheduling mutation to add latency to
+/// barrier edges between ATOMIC_FENCE instructions and preceding
+/// memory accesses potentially affected by the fence.
+/// This encourages the scheduling of more instructions before
+/// ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may
+/// introduce wait counting or indicate an impending S_BARRIER
+/// wait. Having more instructions in-flight across these
+/// constructs improves latency hiding.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUBarrierLatency.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+using namespace llvm;
+
+namespace {
+
+class BarrierLatency : public ScheduleDAGMutation {
+public:
+  BarrierLatency() = default;
+  void apply(ScheduleDAGInstrs *DAG) override;
+};
+
+void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
+  constexpr unsigned SyntheticLatency = 2000;
+  for (SUnit &SU : DAG->SUnits) {
+    const MachineInstr *MI = SU.getInstr();
+    if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
+      continue;
+
+    // Update latency on barrier edges of ATOMIC_FENCE.
+    // We don't consider the scope of the fence or type of instruction
+    // involved in the barrier edge.
+    for (SDep &PredDep : SU.Preds) {
+      if (!PredDep.isBarrier())
+        continue;
+      SUnit *PredSU = PredDep.getSUnit();
+      MachineInstr *PredMI = PredSU->getInstr();
+      // Only consider memory loads.
+      if (!PredMI->mayLoad() || PredMI->mayStore())
+        continue;
+      SDep ForwardD = PredDep;
+      ForwardD.setSUnit(&SU);
+      for (SDep &SuccDep : PredSU->Succs) {
+        if (SuccDep == ForwardD) {
+          SuccDep.setLatency(SuccDep.getLatency() + SyntheticLatency);
+          break;
+        }
+      }
+      PredDep.setLatency(PredDep.getLatency() + SyntheticLatency);
+      PredSU->setDepthDirty();
+      SU.setDepthDirty();
+    }
+  }
+}
+
+} // end namespace
+
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createAMDGPUBarrierLatencyDAGMutation() {
+  return std::make_unique<BarrierLatency>();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
new file mode 100644
index 0000000..c23f0b9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
@@ -0,0 +1,21 @@
+//===- AMDGPUBarrierLatency.h - AMDGPU Barrier Latency ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H + +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include <memory> + +namespace llvm { + +std::unique_ptr<ScheduleDAGMutation> createAMDGPUBarrierLatencyDAGMutation(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 4958a20..996b55f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -17,6 +17,7 @@ #include "AMDGPUTargetMachine.h" #include "AMDGPU.h" #include "AMDGPUAliasAnalysis.h" +#include "AMDGPUBarrierLatency.h" #include "AMDGPUCtorDtorLowering.h" #include "AMDGPUExportClustering.h" #include "AMDGPUExportKernelRuntimeHandles.h" @@ -639,6 +640,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); return DAG; } @@ -659,6 +661,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); return DAG; } @@ -1197,6 +1200,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const { EnableVOPD) DAG->addMutation(createVOPDPairingMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); return DAG; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 13f727b68..a1e0e52 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -52,6 +52,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUAsmPrinter.cpp AMDGPUAtomicOptimizer.cpp AMDGPUAttributor.cpp + AMDGPUBarrierLatency.cpp AMDGPUCallLowering.cpp AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 50447f4..2ff2d2f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4032,28 +4032,31 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) { } } +/// Helper struct for the implementation of 3-address conversion to communicate +/// updates made to instruction operands. +struct SIInstrInfo::ThreeAddressUpdates { + /// Other instruction whose def is no longer used by the converted + /// instruction. + MachineInstr *RemoveMIUse = nullptr; +}; + MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const { MachineBasicBlock &MBB = *MI.getParent(); - unsigned Opc = MI.getOpcode(); + ThreeAddressUpdates U; + MachineInstr *NewMI = convertToThreeAddressImpl(MI, U); - // Handle MFMA. 
- int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); - if (NewMFMAOpc != -1) { - MachineInstrBuilder MIB = - BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) - MIB.add(MI.getOperand(I)); - updateLiveVariables(LV, MI, *MIB); + if (NewMI) { + updateLiveVariables(LV, MI, *NewMI); if (LIS) { - LIS->ReplaceMachineInstrInMaps(MI, *MIB); + LIS->ReplaceMachineInstrInMaps(MI, *NewMI); // SlotIndex of defs needs to be updated when converting to early-clobber - MachineOperand &Def = MIB->getOperand(0); + MachineOperand &Def = NewMI->getOperand(0); if (Def.isEarlyClobber() && Def.isReg() && LIS->hasInterval(Def.getReg())) { - SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false); - SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true); + SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false); + SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true); auto &LI = LIS->getInterval(Def.getReg()); auto UpdateDefIndex = [&](LiveRange &LR) { auto *S = LR.find(OldIndex); @@ -4068,6 +4071,58 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, UpdateDefIndex(SR); } } + } + + if (U.RemoveMIUse) { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + // The only user is the instruction which will be killed. + Register DefReg = U.RemoveMIUse->getOperand(0).getReg(); + + if (MRI.hasOneNonDBGUse(DefReg)) { + // We cannot just remove the DefMI here, calling pass will crash. + U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF)); + U.RemoveMIUse->getOperand(0).setIsDead(true); + for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I) + U.RemoveMIUse->removeOperand(I); + if (LV) + LV->getVarInfo(DefReg).AliveBlocks.clear(); + } + + if (LIS) { + LiveInterval &DefLI = LIS->getInterval(DefReg); + + // We cannot delete the original instruction here, so hack out the use + // in the original instruction with a dummy register so we can use + // shrinkToUses to deal with any multi-use edge cases. Other targets do + // not have the complexity of deleting a use to consider here. + Register DummyReg = MRI.cloneVirtualRegister(DefReg); + for (MachineOperand &MIOp : MI.uses()) { + if (MIOp.isReg() && MIOp.getReg() == DefReg) { + MIOp.setIsUndef(true); + MIOp.setReg(DummyReg); + } + } + + LIS->shrinkToUses(&DefLI); + } + } + + return NewMI; +} + +MachineInstr * +SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI, + ThreeAddressUpdates &U) const { + MachineBasicBlock &MBB = *MI.getParent(); + unsigned Opc = MI.getOpcode(); + + // Handle MFMA. 
+ int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); + if (NewMFMAOpc != -1) { + MachineInstrBuilder MIB = + BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); return MIB; } @@ -4077,11 +4132,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .setMIFlags(MI.getFlags()); for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) MIB->addOperand(MI.getOperand(I)); - - updateLiveVariables(LV, MI, *MIB); - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - return MIB; } @@ -4152,39 +4202,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { MachineInstr *DefMI; - const auto killDef = [&]() -> void { - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - // The only user is the instruction which will be killed. - Register DefReg = DefMI->getOperand(0).getReg(); - - if (MRI.hasOneNonDBGUse(DefReg)) { - // We cannot just remove the DefMI here, calling pass will crash. - DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); - DefMI->getOperand(0).setIsDead(true); - for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) - DefMI->removeOperand(I); - if (LV) - LV->getVarInfo(DefReg).AliveBlocks.clear(); - } - - if (LIS) { - LiveInterval &DefLI = LIS->getInterval(DefReg); - - // We cannot delete the original instruction here, so hack out the use - // in the original instruction with a dummy register so we can use - // shrinkToUses to deal with any multi-use edge cases. Other targets do - // not have the complexity of deleting a use to consider here. - Register DummyReg = MRI.cloneVirtualRegister(DefReg); - for (MachineOperand &MIOp : MI.uses()) { - if (MIOp.isReg() && MIOp.getReg() == DefReg) { - MIOp.setIsUndef(true); - MIOp.setReg(DummyReg); - } - } - - LIS->shrinkToUses(&DefLI); - } - }; int64_t Imm; if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) { @@ -4196,10 +4213,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .add(*Src1) .addImm(Imm) .setMIFlags(MI.getFlags()); - updateLiveVariables(LV, MI, *MIB); - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - killDef(); + U.RemoveMIUse = DefMI; return MIB; } } @@ -4212,11 +4226,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .addImm(Imm) .add(*Src2) .setMIFlags(MI.getFlags()); - updateLiveVariables(LV, MI, *MIB); - - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - killDef(); + U.RemoveMIUse = DefMI; return MIB; } } @@ -4235,12 +4245,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .addImm(Imm) .add(*Src2) .setMIFlags(MI.getFlags()); - updateLiveVariables(LV, MI, *MIB); - - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - if (DefMI) - killDef(); + U.RemoveMIUse = DefMI; return MIB; } } @@ -4269,9 +4274,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .setMIFlags(MI.getFlags()); if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel)) MIB.addImm(OpSel ? 
OpSel->getImm() : 0); - updateLiveVariables(LV, MI, *MIB); - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); return MIB; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index df27ec1..e1d7a07 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -88,6 +88,8 @@ private: }; class SIInstrInfo final : public AMDGPUGenInstrInfo { + struct ThreeAddressUpdates; + private: const SIRegisterInfo RI; const GCNSubtarget &ST; @@ -190,6 +192,9 @@ private: bool resultDependsOnExec(const MachineInstr &MI) const; + MachineInstr *convertToThreeAddressImpl(MachineInstr &MI, + ThreeAddressUpdates &Updates) const; + protected: /// If the specific machine instruction is a instruction that moves/copies /// value from one register to another register return destination and source diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 74d4153..6f1feb1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2223,8 +2223,8 @@ def : GCNPat < def : GCNPat < (DivergentUnaryFrag<fneg> (v2f32 VReg_64:$src)), - (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, VReg_64:$src, - 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, (i64 0), + (V_PK_ADD_F32 !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), VReg_64:$src, + !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), (i64 0), 0, 0, 0, 0, 0) > { let SubtargetPredicate = HasPackedFP32Ops; diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index bdbc000..07264d9 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -397,12 +397,6 @@ public: bool IsCrossAddrSpaceOrdering, Position Pos) const = 0; - /// Inserts any necessary instructions before the barrier start instruction - /// \p MI in order to support pairing of barriers and fences. - virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const { - return false; - }; - /// Virtual destructor to allow derivations to be deleted. virtual ~SICacheControl() = default; }; @@ -583,12 +577,8 @@ public: bool IsCrossAddrSpaceOrdering, Position Pos, AtomicOrdering Order, bool AtomicsOnly) const override; - bool insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const override; - - bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override; + bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, Position Pos) const override; }; class SIGfx11CacheControl : public SIGfx10CacheControl { @@ -2069,8 +2059,11 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, // the WGP. Therefore need to wait for operations to complete to ensure // they are visible to waves in the other CU as the L0 is per CU. // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. - if (!ST.isCuModeEnabled()) { + // which shares the same L0. Note that we still need to wait when + // performing a release in this mode to respect the transitivity of + // happens-before, e.g. other waves of the workgroup must be able to + // release the memory from another wave at a wider scope. 
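// Illustrative sketch (not part of this patch): the transitivity case the
// comment above describes, phrased with C++11 atomics. Wave A and wave B
// are in the same workgroup (the same CU in CU mode); wave C is elsewhere
// on the device. All variable names here are hypothetical.
//
//   int Data = 0;
//   std::atomic<int> Flag1{0}, Flag2{0};
//
//   // Wave A:  Data = 1;
//   //          Flag1.store(1, std::memory_order_release);   // workgroup scope
//   // Wave B:  if (Flag1.load(std::memory_order_acquire))   // workgroup scope
//   //            Flag2.store(1, std::memory_order_release); // device scope
//   // Wave C:  if (Flag2.load(std::memory_order_acquire))
//   //            assert(Data == 1);                         // by transitivity
//
// If A's workgroup-scope release skipped the wait in CU mode, its store to
// Data could still be in flight when B performs the device-scope release,
// and wave C's assert could fire.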
+ if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) VMCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -2225,22 +2218,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return Changed; } -bool SIGfx10CacheControl::insertBarrierStart( - MachineBasicBlock::iterator &MI) const { - // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU - // mode. This is because a CU mode release fence does not emit any wait, which - // is fine when only dealing with vmem, but isn't sufficient in the presence - // of barriers which do not go through vmem. - // GFX12.5 does not require this additional wait. - if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) - return false; - - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); - return true; -} - bool SIGfx11CacheControl::enableLoadCacheBypass( const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { @@ -2419,15 +2396,20 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // In WGP mode the waves of a work-group can be executing on either CU // of the WGP. Therefore need to wait for operations to complete to // ensure they are visible to waves in the other CU as the L0 is per CU. + // // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. + // which shares the same L0. Note that we still need to wait when + // performing a release in this mode to respect the transitivity of + // happens-before, e.g. other waves of the workgroup must be able to + // release the memory from another wave at a wider scope. // // GFX12.5: // CU$ has two ports. To ensure operations are visible at the workgroup // level, we need to ensure all operations in this port have completed // so the other SIMDs in the WG can see them. There is no ordering // guarantee between the ports. 
- if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) { + if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() || + isReleaseOrStronger(Order)) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) LOADCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -3017,11 +2999,6 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) { MI = II->getIterator(); } - if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) { - Changed |= CC->insertBarrierStart(MI); - continue; - } - if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) continue; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 42ec8ba..7cce033 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -775,10 +775,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in { } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in { - defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP_I32_I32_I32_I32>; - defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP_I32_I32_I32_I32>; - defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP_I32_I32_I32_I32>; - defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP_I32_I32_I32_I32>; + defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; + defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; + defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; + defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; } defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>; diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 1f773e2..3368a50 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -820,7 +820,7 @@ void ARMAsmPrinter::emitAttributes() { auto *BTIValue = mdconst::extract_or_null<ConstantInt>( SourceModule->getModuleFlag("branch-target-enforcement")); - if (BTIValue && BTIValue->isOne()) { + if (BTIValue && !BTIValue->isZero()) { // If "+pacbti" is used as an architecture extension, // Tag_BTI_extension is emitted in // ARMTargetStreamer::emitTargetAttributes(). 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 35e1127..b1a668e 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1089,7 +1089,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, // Register based DivRem for AEABI (RTABI 4.2) if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() || - TT.isTargetMuslAEABI() || TT.isOSWindows()) { + TT.isTargetMuslAEABI() || TT.isOSFuchsia() || TT.isOSWindows()) { setOperationAction(ISD::SREM, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i64, Custom); HasStandaloneRem = false; @@ -1353,6 +1353,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::FLOG10, MVT::f16, Promote); setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::LRINT, MVT::f16, Expand); + setOperationAction(ISD::LROUND, MVT::f16, Expand); setOperationAction(ISD::FROUND, MVT::f16, Legal); setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); @@ -20574,7 +20575,7 @@ static TargetLowering::ArgListTy getDivRemArgList( SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || - Subtarget->isTargetWindows()) && + Subtarget->isTargetFuchsia() || Subtarget->isTargetWindows()) && "Register-based DivRem lowering only"); unsigned Opcode = Op->getOpcode(); assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 96ee69c..597d311 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -882,7 +882,7 @@ static bool producesFalseLanesZero(MachineInstr &MI, continue; // Skip the lr predicate reg int PIdx = llvm::findFirstVPTPredOperandIdx(MI); - if (PIdx != -1 && (int)MO.getOperandNo() == PIdx + 2) + if (PIdx != -1 && MO.getOperandNo() == PIdx + ARM::SUBOP_vpred_n_tp_reg) continue; // Check that this instruction will produce zeros in its false lanes: @@ -1036,6 +1036,7 @@ bool LowOverheadLoop::ValidateLiveOuts() { while (!Worklist.empty()) { MachineInstr *MI = Worklist.pop_back_val(); if (MI->getOpcode() == ARM::MQPRCopy) { + LLVM_DEBUG(dbgs() << " Must generate copy as VMOV: " << *MI); VMOVCopies.insert(MI); MachineInstr *CopySrc = RDI.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg()); @@ -1045,6 +1046,20 @@ bool LowOverheadLoop::ValidateLiveOuts() { LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI); VMOVCopies.clear(); return false; + } else if (isVectorPredicated(MI)) { + // If this is a predicated instruction with merging semantics, + // check where it gets its false lanes from, if any. 
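// Illustrative sketch (not part of this patch): "merging semantics" here
// means the instruction carries a vpred_r predicate bundle with an
// $inactive operand, so lanes where the VPT predicate is false keep the
// $inactive operand's lane values instead of being zeroed. Schematically
// (instruction and operand names hypothetical):
//
//   %dst = VADDt1 %a, %b, vpred_r(%mask, $inactive = %c)
//   // lane i of %dst == %mask[i] ? (%a[i] + %b[i]) : %c[i]
//
// %dst therefore only has zeroed false lanes if %c does, which is why the
// reaching definition of the $inactive operand is pushed onto the worklist
// in the code that follows.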
+ int InactiveIdx = findVPTInactiveOperandIdx(*MI); + if (InactiveIdx != -1) { + SmallPtrSet<MachineInstr *, 2> Defs; + MachineInstr *FalseSrc = RDI.getUniqueReachingMIDef( + MI, MI->getOperand(InactiveIdx).getReg()); + if (FalseSrc) { + LLVM_DEBUG(dbgs() + << " Must check source of false lanes for: " << *MI); + Worklist.push_back(FalseSrc); + } + } } } diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index b2d368e..4a0883c 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -343,6 +343,7 @@ public: bool isTargetWatchOS() const { return TargetTriple.isWatchOS(); } bool isTargetWatchABI() const { return TargetTriple.isWatchABI(); } bool isTargetDriverKit() const { return TargetTriple.isDriverKit(); } + bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); } bool isTargetWindows() const { return TargetTriple.isOSWindows(); } diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp index 5eeb4fe..413e844 100644 --- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -534,7 +534,7 @@ bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML, Register LR = LoopPhi->getOperand(0).getReg(); for (MachineInstr *MI : MVEInstrs) { int Idx = findFirstVPTPredOperandIdx(*MI); - MI->getOperand(Idx + 2).setReg(LR); + MI->getOperand(Idx + ARM::SUBOP_vpred_n_tp_reg).setReg(LR); } } diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index 431ce38..f5653d4 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -805,6 +805,16 @@ int llvm::findFirstVPTPredOperandIdx(const MachineInstr &MI) { return -1; } +int llvm::findVPTInactiveOperandIdx(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + + for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) + if (MCID.operands()[i].OperandType == ARM::OPERAND_VPRED_R) + return i + ARM::SUBOP_vpred_r_inactive; + + return -1; +} + ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI, Register &PredReg) { int PIdx = findFirstVPTPredOperandIdx(MI); diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 3ec3a621..1b0bf2d 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -90,6 +90,9 @@ inline ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI) { Register PredReg; return getVPTInstrPredicate(MI, PredReg); } +// Identify the input operand in an MVE predicated instruction which +// contributes the values of any inactive vector lanes. +int findVPTInactiveOperandIdx(const MachineInstr &MI); // Recomputes the Block Mask of Instr, a VPT or VPST instruction. 
// This rebuilds the block mask of the instruction depending on the predicates diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp index c8866bf..42e90f0 100644 --- a/llvm/lib/Target/DirectX/DXILPrepare.cpp +++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp @@ -294,6 +294,14 @@ public: if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures")) RootSignature->eraseFromParent(); + // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and + // causes all tests using the DXIL Validator to fail. + // + // This is a temporary fix and should be replaced with a whitelist once + // we have determined all metadata that the DXIL Validator allows + if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa")) + ErrNo->eraseFromParent(); + return true; } diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt index d758260..1a5f096 100644 --- a/llvm/lib/Target/Hexagon/CMakeLists.txt +++ b/llvm/lib/Target/Hexagon/CMakeLists.txt @@ -54,6 +54,7 @@ add_llvm_target(HexagonCodeGen HexagonOptAddrMode.cpp HexagonOptimizeSZextends.cpp HexagonPeephole.cpp + HexagonQFPOptimizer.cpp HexagonRDFOpt.cpp HexagonRegisterInfo.cpp HexagonSelectionDAGInfo.cpp diff --git a/llvm/lib/Target/Hexagon/Hexagon.h b/llvm/lib/Target/Hexagon/Hexagon.h index 109aba5..422ab20 100644 --- a/llvm/lib/Target/Hexagon/Hexagon.h +++ b/llvm/lib/Target/Hexagon/Hexagon.h @@ -67,6 +67,8 @@ void initializeHexagonPeepholePass(PassRegistry &); void initializeHexagonSplitConst32AndConst64Pass(PassRegistry &); void initializeHexagonVectorPrintPass(PassRegistry &); +void initializeHexagonQFPOptimizerPass(PassRegistry &); + Pass *createHexagonLoopIdiomPass(); Pass *createHexagonVectorLoopCarriedReuseLegacyPass(); @@ -112,6 +114,7 @@ FunctionPass *createHexagonVectorCombineLegacyPass(); FunctionPass *createHexagonVectorPrint(); FunctionPass *createHexagonVExtract(); FunctionPass *createHexagonExpandCondsets(); +FunctionPass *createHexagonQFPOptimizer(); } // end namespace llvm; diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index 4ddbe7a..ff876f6 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -920,6 +920,10 @@ void HexagonGenInsert::collectInBlock(MachineBasicBlock *B, // successors have been processed. RegisterSet BlockDefs, InsDefs; for (MachineInstr &MI : *B) { + // Stop if the map size is too large. + if (IFMap.size() >= MaxIFMSize) + break; + InsDefs.clear(); getInstrDefs(&MI, InsDefs); // Leave those alone. They are more transparent than "insert". @@ -942,8 +946,8 @@ void HexagonGenInsert::collectInBlock(MachineBasicBlock *B, findRecordInsertForms(VR, AVs); // Stop if the map size is too large. 
-    if (IFMap.size() > MaxIFMSize)
-      return;
+    if (IFMap.size() >= MaxIFMSize)
+      break;
   }
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index a94e131..54c8972 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -117,8 +117,10 @@ HexagonTargetLowering::initializeHVXLowering() {
     setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal);
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
 
-  if (Subtarget.useHVX128BOps())
+  if (Subtarget.useHVX128BOps()) {
     setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
+    setOperationAction(ISD::BITCAST, MVT::v64i1, Custom);
+  }
 
   if (Subtarget.useHVX128BOps() && Subtarget.useHVXV68Ops() &&
       Subtarget.useHVXFloatingPoint()) {
@@ -2024,13 +2026,9 @@ HexagonTargetLowering::LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const {
   // Handle bitcast from i32, v2i16, and v4i8 to v32i1.
   // Splat the input into a 32-element i32 vector, then AND each element
   // with a unique bitmask to isolate individual bits.
-  if (ResTy == MVT::v32i1 &&
-      (ValTy == MVT::i32 || ValTy == MVT::v2i16 || ValTy == MVT::v4i8) &&
-      Subtarget.useHVX128BOps()) {
-    SDValue Val32 = Val;
-    if (ValTy == MVT::v2i16 || ValTy == MVT::v4i8)
-      Val32 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Val);
-
+  auto bitcastI32ToV32I1 = [&](SDValue Val32) {
+    assert(Val32.getValueType().getSizeInBits() == 32 &&
+           "Input must be 32 bits");
     MVT VecTy = MVT::getVectorVT(MVT::i32, 32);
     SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, dl, VecTy, Val32);
     SmallVector<SDValue, 32> Mask;
@@ -2039,7 +2037,31 @@ HexagonTargetLowering::LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const {
     SDValue MaskVec = DAG.getBuildVector(VecTy, dl, Mask);
     SDValue Anded = DAG.getNode(ISD::AND, dl, VecTy, Splat, MaskVec);
-    return DAG.getNode(HexagonISD::V2Q, dl, ResTy, Anded);
+    return DAG.getNode(HexagonISD::V2Q, dl, MVT::v32i1, Anded);
+  };
+  // === Case: v32i1 ===
+  if (ResTy == MVT::v32i1 &&
+      (ValTy == MVT::i32 || ValTy == MVT::v2i16 || ValTy == MVT::v4i8) &&
+      Subtarget.useHVX128BOps()) {
+    SDValue Val32 = Val;
+    if (ValTy == MVT::v2i16 || ValTy == MVT::v4i8)
+      Val32 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Val);
+    return bitcastI32ToV32I1(Val32);
+  }
+  // === Case: v64i1 ===
+  if (ResTy == MVT::v64i1 && ValTy == MVT::i64 && Subtarget.useHVX128BOps()) {
+    // Split i64 into lo/hi 32-bit halves.
+    SDValue Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Val);
+    SDValue HiShifted = DAG.getNode(ISD::SRL, dl, MVT::i64, Val,
+                                    DAG.getConstant(32, dl, MVT::i64));
+    SDValue Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, HiShifted);
+
+    // Reuse the same 32-bit logic twice.
+    SDValue LoRes = bitcastI32ToV32I1(Lo);
+    SDValue HiRes = bitcastI32ToV32I1(Hi);
+
+    // Concatenate into a v64i1 predicate.
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, LoRes, HiRes);
   }
 
   if (isHvxBoolTy(ResTy) && ValTy.isScalarInteger()) {
diff --git a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
new file mode 100644
index 0000000..479ac90
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
@@ -0,0 +1,334 @@
+//===- HexagonQFPOptimizer.cpp - QFP to IEEE-FP conversion optimizer -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Basic infrastructure for optimizing intermediate conversion instructions
+// generated while performing vector floating-point operations.
+// It runs at the start of code generation for Hexagon, cleans up redundant
+// conversion instructions, and replaces their uses with the appropriate
+// machine operand. Liveness is preserved after this pass.
+//
+// @note: The redundant conversion instructions are not eliminated in this
+// pass. We only replace the uses of conversion instructions with their
+// appropriate QFP instructions, leaving it to the Dead Instruction
+// Elimination pass to remove the now-redundant conversion instructions.
+//
+// Brief overview of how this QFP optimizer works:
+// It iterates over each instruction and checks whether it belongs to the
+// Hexagon HVX floating-point arithmetic instruction category (Add, Sub,
+// Mul). It then finds the unique definition of each machine operand of the
+// instruction.
+//
+// Example:
+// Let MachineInstr *MI be an HVX vadd instruction:
+// MI -> $v0 = V6_vadd_sf $v1, $v2
+// MachineInstr *DefMI1 = MRI->getVRegDef(MI->getOperand(1).getReg());
+// MachineInstr *DefMI2 = MRI->getVRegDef(MI->getOperand(2).getReg());
+//
+// In the above example, DefMI1 and DefMI2 give the unique definitions of
+// the operands of MI ($v1 and $v2, respectively).
+//
+// If neither definition is a conversion instruction (V6_vconv_sf_qf32,
+// V6_vconv_hf_qf16), the pass skips the current instruction and moves on
+// to the next one.
+//
+// If one of the definitions is a conversion instruction, the pass replaces
+// the arithmetic instruction with its corresponding "mix" variant.
+// In the above example, if $v1 is defined by a conversion
+// DefMI1 -> $v1 = V6_vconv_sf_qf32 $v3
+// After transformation:
+// MI -> $v0 = V6_vadd_qf32_mix $v3, $v2 ($v1 is replaced with $v3)
+//
+// If both definitions are conversion instructions, the instruction is
+// replaced with its "qf" variant.
+// In the above example, if $v1 and $v2 are defined by conversions
+// DefMI1 -> $v1 = V6_vconv_sf_qf32 $v3
+// DefMI2 -> $v2 = V6_vconv_sf_qf32 $v4
+// After transformation:
+// MI -> $v0 = V6_vadd_qf32 $v3, $v4 ($v1 is replaced with $v3, $v2 is
+// replaced with $v4)
+//
+// This pass does not currently handle the case where the definitions are
+// PHI instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+#include <vector>
+
+#define DEBUG_TYPE "hexagon-qfp-optimizer"
+#define HEXAGON_QFP_OPTIMIZER "QFP optimizer pass"
+
+using namespace llvm;
+
+static cl::opt<bool>
+    DisableQFOptimizer("disable-qfp-opt", cl::init(false),
+                       cl::desc("Disable optimization of Qfloat operations."));
+
+namespace {
+const std::map<unsigned short, unsigned short> QFPInstMap{
+    {Hexagon::V6_vadd_hf, Hexagon::V6_vadd_qf16_mix},
+    {Hexagon::V6_vadd_qf16_mix, Hexagon::V6_vadd_qf16},
+    {Hexagon::V6_vadd_sf, Hexagon::V6_vadd_qf32_mix},
+    {Hexagon::V6_vadd_qf32_mix, Hexagon::V6_vadd_qf32},
+    {Hexagon::V6_vsub_hf, Hexagon::V6_vsub_qf16_mix},
+    {Hexagon::V6_vsub_qf16_mix, Hexagon::V6_vsub_qf16},
+    {Hexagon::V6_vsub_sf, Hexagon::V6_vsub_qf32_mix},
+    {Hexagon::V6_vsub_qf32_mix, Hexagon::V6_vsub_qf32},
+    {Hexagon::V6_vmpy_qf16_hf, Hexagon::V6_vmpy_qf16_mix_hf},
+    {Hexagon::V6_vmpy_qf16_mix_hf, Hexagon::V6_vmpy_qf16},
+    {Hexagon::V6_vmpy_qf32_hf, Hexagon::V6_vmpy_qf32_mix_hf},
+    {Hexagon::V6_vmpy_qf32_mix_hf, Hexagon::V6_vmpy_qf32_qf16},
+    {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}};
+} // namespace
+
+namespace llvm {
+
+FunctionPass *createHexagonQFPOptimizer();
+void initializeHexagonQFPOptimizerPass(PassRegistry &);
+
+} // namespace llvm
+
+namespace {
+
+struct HexagonQFPOptimizer : public MachineFunctionPass {
+public:
+  static char ID;
+
+  HexagonQFPOptimizer() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  bool optimizeQfp(MachineInstr *MI, MachineBasicBlock *MBB);
+
+  StringRef getPassName() const override { return HEXAGON_QFP_OPTIMIZER; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  const HexagonSubtarget *HST = nullptr;
+  const HexagonInstrInfo *HII = nullptr;
+  const MachineRegisterInfo *MRI = nullptr;
+};
+
+char HexagonQFPOptimizer::ID = 0;
+} // namespace
+
+INITIALIZE_PASS(HexagonQFPOptimizer, "hexagon-qfp-optimizer",
+                HEXAGON_QFP_OPTIMIZER, false, false)
+
+FunctionPass *llvm::createHexagonQFPOptimizer() {
+  return new HexagonQFPOptimizer();
+}
+
+bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
+                                      MachineBasicBlock *MBB) {
+
+  // Early exit if the instruction has too few operands (QFP ops need two
+  // sources plus one dest) or has no transformation mapping.
+ if (MI->getNumOperands() < 3) + return false; + auto It = QFPInstMap.find(MI->getOpcode()); + if (It == QFPInstMap.end()) + return false; + unsigned short InstTy = It->second; + + unsigned Op0F = 0; + unsigned Op1F = 0; + // Get the reaching defs of MI, DefMI1 and DefMI2 + MachineInstr *DefMI1 = nullptr; + MachineInstr *DefMI2 = nullptr; + + if (MI->getOperand(1).isReg()) + DefMI1 = MRI->getVRegDef(MI->getOperand(1).getReg()); + if (MI->getOperand(2).isReg()) + DefMI2 = MRI->getVRegDef(MI->getOperand(2).getReg()); + if (!DefMI1 || !DefMI2) + return false; + + MachineOperand &Res = MI->getOperand(0); + MachineInstr *Inst1 = nullptr; + MachineInstr *Inst2 = nullptr; + LLVM_DEBUG(dbgs() << "\n[Reaching Defs of operands]: "; DefMI1->dump(); + DefMI2->dump()); + + // Get the reaching defs of DefMI + if (DefMI1->getNumOperands() > 1 && DefMI1->getOperand(1).isReg() && + DefMI1->getOperand(1).getReg().isVirtual()) + Inst1 = MRI->getVRegDef(DefMI1->getOperand(1).getReg()); + + if (DefMI2->getNumOperands() > 1 && DefMI2->getOperand(1).isReg() && + DefMI2->getOperand(1).getReg().isVirtual()) + Inst2 = MRI->getVRegDef(DefMI2->getOperand(1).getReg()); + + unsigned Def1OP = DefMI1->getOpcode(); + unsigned Def2OP = DefMI2->getOpcode(); + + MachineInstrBuilder MIB; + // Case 1: Both reaching defs of MI are qf to sf/hf conversions + if ((Def1OP == Hexagon::V6_vconv_sf_qf32 && + Def2OP == Hexagon::V6_vconv_sf_qf32) || + (Def1OP == Hexagon::V6_vconv_hf_qf16 && + Def2OP == Hexagon::V6_vconv_hf_qf16)) { + + // If the reaching defs of DefMI are W register type, we return + if ((Inst1 && Inst1->getNumOperands() > 0 && Inst1->getOperand(0).isReg() && + MRI->getRegClass(Inst1->getOperand(0).getReg()) == + &Hexagon::HvxWRRegClass) || + (Inst2 && Inst2->getNumOperands() > 0 && Inst2->getOperand(0).isReg() && + MRI->getRegClass(Inst2->getOperand(0).getReg()) == + &Hexagon::HvxWRRegClass)) + return false; + + // Analyze the use operands of the conversion to get their KILL status + MachineOperand &Src1 = DefMI1->getOperand(1); + MachineOperand &Src2 = DefMI2->getOperand(1); + + Op0F = getKillRegState(Src1.isKill()); + Src1.setIsKill(false); + + Op1F = getKillRegState(Src2.isKill()); + Src2.setIsKill(false); + + if (MI->getOpcode() != Hexagon::V6_vmpy_qf32_sf) { + auto OuterIt = QFPInstMap.find(MI->getOpcode()); + if (OuterIt == QFPInstMap.end()) + return false; + auto InnerIt = QFPInstMap.find(OuterIt->second); + if (InnerIt == QFPInstMap.end()) + return false; + InstTy = InnerIt->second; + } + + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) + .addReg(Src1.getReg(), Op0F, Src1.getSubReg()) + .addReg(Src2.getReg(), Op1F, Src2.getSubReg()); + LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); + return true; + + // Case 2: Left operand is conversion to sf/hf + } else if (((Def1OP == Hexagon::V6_vconv_sf_qf32 && + Def2OP != Hexagon::V6_vconv_sf_qf32) || + (Def1OP == Hexagon::V6_vconv_hf_qf16 && + Def2OP != Hexagon::V6_vconv_hf_qf16)) && + !DefMI2->isPHI() && + (MI->getOpcode() != Hexagon::V6_vmpy_qf32_sf)) { + + if (Inst1 && MRI->getRegClass(Inst1->getOperand(0).getReg()) == + &Hexagon::HvxWRRegClass) + return false; + + MachineOperand &Src1 = DefMI1->getOperand(1); + MachineOperand &Src2 = MI->getOperand(2); + + Op0F = getKillRegState(Src1.isKill()); + Src1.setIsKill(false); + Op1F = getKillRegState(Src2.isKill()); + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) + .addReg(Src1.getReg(), Op0F, Src1.getSubReg()) + .addReg(Src2.getReg(), Op1F, 
Src2.getSubReg());
+    LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump());
+    return true;
+
+    // Case 3: Right operand is conversion to sf/hf
+  } else if (((Def1OP != Hexagon::V6_vconv_sf_qf32 &&
+               Def2OP == Hexagon::V6_vconv_sf_qf32) ||
+              (Def1OP != Hexagon::V6_vconv_hf_qf16 &&
+               Def2OP == Hexagon::V6_vconv_hf_qf16)) &&
+             !DefMI1->isPHI() &&
+             (MI->getOpcode() != Hexagon::V6_vmpy_qf32_sf)) {
+    // The second operand of the original instruction is converted.
+    // In "mix" instructions, the "qf" operand is always the first operand.
+
+    // Caveat: vsub is not commutative w.r.t. operands.
+    if (InstTy == Hexagon::V6_vsub_qf16_mix ||
+        InstTy == Hexagon::V6_vsub_qf32_mix)
+      return false;
+
+    if (Inst2 && MRI->getRegClass(Inst2->getOperand(0).getReg()) ==
+                     &Hexagon::HvxWRRegClass)
+      return false;
+
+    MachineOperand &Src1 = MI->getOperand(1);
+    MachineOperand &Src2 = DefMI2->getOperand(1);
+
+    Op1F = getKillRegState(Src2.isKill());
+    Src2.setIsKill(false);
+    Op0F = getKillRegState(Src1.isKill());
+    MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg())
+              .addReg(Src2.getReg(), Op1F,
+                      Src2.getSubReg()) // Notice the operands are flipped.
+              .addReg(Src1.getReg(), Op0F, Src1.getSubReg());
+    LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump());
+    return true;
+  }
+
+  return false;
+}
+
+bool HexagonQFPOptimizer::runOnMachineFunction(MachineFunction &MF) {
+
+  bool Changed = false;
+
+  if (DisableQFOptimizer)
+    return Changed;
+
+  HST = &MF.getSubtarget<HexagonSubtarget>();
+  if (!HST->useHVXV68Ops() || !HST->usePackets() ||
+      skipFunction(MF.getFunction()))
+    return false;
+  HII = HST->getInstrInfo();
+  MRI = &MF.getRegInfo();
+
+  MachineFunction::iterator MBBI = MF.begin();
+  LLVM_DEBUG(dbgs() << "\n=== Running QFPOptimizer Pass for: " << MF.getName()
+                    << " Optimize intermediate conversions ===\n");
+  while (MBBI != MF.end()) {
+    MachineBasicBlock *MBB = &*MBBI;
+    MachineBasicBlock::iterator MII = MBBI->instr_begin();
+    while (MII != MBBI->instr_end()) {
+      MachineInstr *MI = &*MII;
+      ++MII; // As MI might be removed.
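// Illustrative sketch (not part of this patch): the increment above is the
// usual advance-before-erase idiom. MII is moved past MI before MI can be
// erased, so the iterator stays valid. A minimal standalone form, with
// shouldErase as a hypothetical predicate:
//
//   while (MII != MBB->instr_end()) {
//     MachineInstr *Cur = &*MII;
//     ++MII;                      // advance first...
//     if (shouldErase(Cur))
//       Cur->eraseFromParent();   // ...so erasing Cur cannot invalidate MII
//   }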
+ + if (QFPInstMap.count(MI->getOpcode()) && + MI->getOpcode() != Hexagon::V6_vconv_sf_qf32 && + MI->getOpcode() != Hexagon::V6_vconv_hf_qf16) { + LLVM_DEBUG(dbgs() << "\n###Analyzing for removal: "; MI->dump()); + if (optimizeQfp(MI, MBB)) { + MI->eraseFromParent(); + LLVM_DEBUG(dbgs() << "\t....Removing...."); + Changed = true; + } + } + } + ++MBBI; + } + return Changed; +} diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index f5d8b69..d9824a31 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -220,6 +220,7 @@ LLVMInitializeHexagonTarget() { initializeHexagonPeepholePass(PR); initializeHexagonSplitConst32AndConst64Pass(PR); initializeHexagonVectorPrintPass(PR); + initializeHexagonQFPOptimizerPass(PR); } HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, @@ -386,6 +387,7 @@ bool HexagonPassConfig::addInstSelector() { addPass(createHexagonGenInsert()); if (EnableEarlyIf) addPass(createHexagonEarlyIfConversion()); + addPass(createHexagonQFPOptimizer()); } return false; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 8bf0d11..d477522 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -442,14 +442,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); // If we're enabling GP optimizations, use hardware square root - if (!Subtarget.hasFSQRT() && - !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() && - Subtarget.hasFRE())) + if (!Subtarget.hasFSQRT() && !(Subtarget.hasFRSQRTE() && Subtarget.hasFRE())) setOperationAction(ISD::FSQRT, MVT::f64, Expand); if (!Subtarget.hasFSQRT() && - !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() && - Subtarget.hasFRES())) + !(Subtarget.hasFRSQRTES() && Subtarget.hasFRES())) setOperationAction(ISD::FSQRT, MVT::f32, Expand); if (Subtarget.hasFCPSGN()) { @@ -565,16 +562,15 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::i32, Legal); setOperationAction(ISD::BITCAST, MVT::i64, Legal); setOperationAction(ISD::BITCAST, MVT::f64, Legal); - if (TM.Options.UnsafeFPMath) { - setOperationAction(ISD::LRINT, MVT::f64, Legal); - setOperationAction(ISD::LRINT, MVT::f32, Legal); - setOperationAction(ISD::LLRINT, MVT::f64, Legal); - setOperationAction(ISD::LLRINT, MVT::f32, Legal); - setOperationAction(ISD::LROUND, MVT::f64, Legal); - setOperationAction(ISD::LROUND, MVT::f32, Legal); - setOperationAction(ISD::LLROUND, MVT::f64, Legal); - setOperationAction(ISD::LLROUND, MVT::f32, Legal); - } + + setOperationAction(ISD::STRICT_LRINT, MVT::f64, Custom); + setOperationAction(ISD::STRICT_LRINT, MVT::f32, Custom); + setOperationAction(ISD::STRICT_LLRINT, MVT::f64, Custom); + setOperationAction(ISD::STRICT_LLRINT, MVT::f32, Custom); + setOperationAction(ISD::STRICT_LROUND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_LROUND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_LLROUND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_LLROUND, MVT::f32, Custom); } else { setOperationAction(ISD::BITCAST, MVT::f32, Expand); setOperationAction(ISD::BITCAST, MVT::i32, Expand); @@ -1034,11 +1030,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); // The nearbyint variants are not allowed 
to raise the inexact exception - // so we can only code-gen them with unsafe math. - if (TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); - } + // so we can only code-gen them with fpexcept.ignore. + setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f32, Custom); setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); @@ -8911,11 +8905,13 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // be lost at this stage, but is below the single-precision rounding // position. // - // However, if -enable-unsafe-fp-math is in effect, accept double + // However, if afn is in effect, accept double // rounding to avoid the extra overhead. - if (Op.getValueType() == MVT::f32 && - !Subtarget.hasFPCVT() && - !DAG.getTarget().Options.UnsafeFPMath) { + // FIXME: Currently INT_TO_FP can't support fast math flags because + // of nneg flag, thus Op->getFlags().hasApproximateFuncs() is always + // false. + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT() && + !Op->getFlags().hasApproximateFuncs()) { // Twiddle input to make sure the low 11 bits are zero. (If this // is the case, we are guaranteed the value will fit into the 53 bit @@ -12759,6 +12755,14 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerADDSUBO_CARRY(Op, DAG); case ISD::UCMP: return LowerUCMP(Op, DAG); + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: + case ISD::STRICT_FNEARBYINT: + if (Op->getFlags().hasNoFPExcept()) + return Op; + return SDValue(); } } @@ -13088,7 +13092,9 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, BuildMI(BB, dl, TII->get(StoreMnemonic)) .addReg(TmpReg).addReg(ptrA).addReg(ptrB); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); + .addImm(PPC::PRED_NE_MINUS) + .addReg(PPC::CR0) + .addMBB(loopMBB); BB->addSuccessor(loopMBB); BB->addSuccessor(exitMBB); @@ -13346,7 +13352,7 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( .addReg(ZeroReg) .addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE) + .addImm(PPC::PRED_NE_MINUS) .addReg(PPC::CR0) .addMBB(loopMBB); BB->addSuccessor(loopMBB); @@ -14177,7 +14183,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(dest) .addReg(oldval); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE) + .addImm(PPC::PRED_NE_MINUS) .addReg(CrReg) .addMBB(exitMBB); BB->addSuccessor(loop2MBB); @@ -14189,7 +14195,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(ptrA) .addReg(ptrB); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE) + .addImm(PPC::PRED_NE_MINUS) .addReg(PPC::CR0) .addMBB(loop1MBB); BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); @@ -18707,11 +18713,12 @@ bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const { const Function *F = I->getFunction(); const DataLayout &DL = F->getDataLayout(); Type *Ty = User->getOperand(0)->getType(); + bool AllowContract = I->getFastMathFlags().allowContract() && + User->getFastMathFlags().allowContract(); - return !( - isFMAFasterThanFMulAndFAdd(*F, Ty) && - isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)); + return !(isFMAFasterThanFMulAndFAdd(*F, Ty) && + 
isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && + (AllowContract || Options.AllowFPOpFusion == FPOpFusion::Fast)); } case Instruction::Load: { // Don't break "store (load float*)" pattern, this pattern will be combined diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 979ba31..885bed6 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -2786,14 +2786,16 @@ def : Pat<(v2f64 (any_frint v2f64:$S)), (v2f64 (XVRDPIC $S))>; // Rounding without exceptions (nearbyint). Due to strange tblgen behaviour, // these need to be defined after the any_frint versions so ISEL will correctly // add the chain to the strict versions. -def : Pat<(f32 (fnearbyint f32:$S)), +// TODO: Match strict fp rounding intrinsics with instructions like xsrdpiz when +// rounding mode is propagated to CodeGen part. +def : Pat<(f32 (strict_fnearbyint f32:$S)), (f32 (COPY_TO_REGCLASS (XSRDPIC (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; -def : Pat<(f64 (fnearbyint f64:$S)), +def : Pat<(f64 (strict_fnearbyint f64:$S)), (f64 (XSRDPIC $S))>; -def : Pat<(v2f64 (fnearbyint v2f64:$S)), +def : Pat<(v2f64 (strict_fnearbyint v2f64:$S)), (v2f64 (XVRDPIC $S))>; -def : Pat<(v4f32 (fnearbyint v4f32:$S)), +def : Pat<(v4f32 (strict_fnearbyint v4f32:$S)), (v4f32 (XVRSPIC $S))>; // Materialize a zero-vector of long long @@ -3578,25 +3580,25 @@ def : Pat<(f64 (bitconvert i64:$S)), (f64 (MTVSRD $S))>; // Rounding to integer. -def : Pat<(i64 (lrint f64:$S)), +def : Pat<(i64 (strict_lrint f64:$S)), (i64 (MFVSRD (FCTID $S)))>; -def : Pat<(i64 (lrint f32:$S)), +def : Pat<(i64 (strict_lrint f32:$S)), (i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>; -def : Pat<(i64 (llrint f64:$S)), +def : Pat<(i64 (strict_llrint f64:$S)), (i64 (MFVSRD (FCTID $S)))>; -def : Pat<(i64 (llrint f32:$S)), +def : Pat<(i64 (strict_llrint f32:$S)), (i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>; -def : Pat<(i64 (lround f64:$S)), +def : Pat<(i64 (strict_lround f64:$S)), (i64 (MFVSRD (FCTID (XSRDPI $S))))>; -def : Pat<(i64 (lround f32:$S)), +def : Pat<(i64 (strict_lround f32:$S)), (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; -def : Pat<(i32 (lround f64:$S)), +def : Pat<(i32 (strict_lround f64:$S)), (i32 (MFVSRWZ (FCTIW (XSRDPI $S))))>; -def : Pat<(i32 (lround f32:$S)), +def : Pat<(i32 (strict_lround f32:$S)), (i32 (MFVSRWZ (FCTIW (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; -def : Pat<(i64 (llround f64:$S)), +def : Pat<(i64 (strict_llround f64:$S)), (i64 (MFVSRD (FCTID (XSRDPI $S))))>; -def : Pat<(i64 (llround f32:$S)), +def : Pat<(i64 (strict_llround f32:$S)), (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; // Alternate patterns for PPCmtvsrz where the output is v8i16 or v16i8 instead diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index e857b2d..edde7ac 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2406,7 +2406,8 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) { } bool RISCVAsmParser::generateVTypeError(SMLoc ErrorLoc) { - if (STI->hasFeature(RISCV::FeatureStdExtZvfbfa)) + if (STI->hasFeature(RISCV::FeatureStdExtZvfbfa) || + STI->hasFeature(RISCV::FeatureVendorXSfvfbfexp16e)) return Error( ErrorLoc, "operand must be " diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index b8ec0bb..4bea4c4 
100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -654,7 +654,10 @@ static constexpr FeatureBitset XqciFeatureGroup = { static constexpr FeatureBitset XSfVectorGroup = { RISCV::FeatureVendorXSfvcp, RISCV::FeatureVendorXSfvqmaccdod, RISCV::FeatureVendorXSfvqmaccqoq, RISCV::FeatureVendorXSfvfwmaccqqq, - RISCV::FeatureVendorXSfvfnrclipxfqf, RISCV::FeatureVendorXSfmmbase}; + RISCV::FeatureVendorXSfvfnrclipxfqf, RISCV::FeatureVendorXSfmmbase, + RISCV::FeatureVendorXSfvfexpa, RISCV::FeatureVendorXSfvfexpa64e, + RISCV::FeatureVendorXSfvfbfexp16e, RISCV::FeatureVendorXSfvfexp16e, + RISCV::FeatureVendorXSfvfexp32e}; static constexpr FeatureBitset XSfSystemGroup = { RISCV::FeatureVendorXSiFivecdiscarddlone, RISCV::FeatureVendorXSiFivecflushdlone, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index 50f5a5d..7b9c4b3 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -220,7 +220,8 @@ void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo, if (RISCVVType::getVLMUL(Imm) == RISCVVType::VLMUL::LMUL_RESERVED || RISCVVType::getSEW(Imm) > 64 || (RISCVVType::isAltFmt(Imm) && - !STI.hasFeature(RISCV::FeatureStdExtZvfbfa)) || + !(STI.hasFeature(RISCV::FeatureStdExtZvfbfa) || + STI.hasFeature(RISCV::FeatureVendorXSfvfbfexp16e))) || (Imm >> 9) != 0) { O << formatImm(Imm); return; diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp index 5dd4bf4..98b636e 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp @@ -109,12 +109,70 @@ bool RISCVExpandAtomicPseudo::expandMI(MachineBasicBlock &MBB, // expanded instructions for each pseudo is correct in the Size field of the // tablegen definition for the pseudo. 
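// Illustrative sketch (not part of this patch): for the unmasked binary
// pseudos handled below, doAtomicBinOpExpansion emits the standard LR/SC
// retry loop. PseudoAtomicLoadAdd32, for example, becomes roughly:
//
//   .loop:
//     lr.w   dest, (addr)              // load-reserved
//     add    scratch, dest, incr       // the BinOp-specific step
//     sc.w   scratch, scratch, (addr)  // store-conditional
//     bnez   scratch, .loop            // retry if the reservation was lost
//
// (Xchg uses "addi scratch, incr, 0", i.e. a register move, in place of the
// add; the .aq/.rl ordering suffixes are chosen from the pseudo's
// AtomicOrdering operand.)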
switch (MBBI->getOpcode()) { + case RISCV::PseudoAtomicSwap32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xchg, false, 32, + NextMBBI); + case RISCV::PseudoAtomicSwap64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xchg, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadAdd32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Add, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadAdd64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Add, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadSub32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Sub, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadSub64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Sub, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadAnd32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::And, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadAnd64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::And, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadOr32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Or, false, 32, NextMBBI); + case RISCV::PseudoAtomicLoadOr64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Or, false, 64, NextMBBI); + case RISCV::PseudoAtomicLoadXor32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xor, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadXor64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xor, false, 64, + NextMBBI); case RISCV::PseudoAtomicLoadNand32: return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 32, NextMBBI); case RISCV::PseudoAtomicLoadNand64: return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 64, NextMBBI); + case RISCV::PseudoAtomicLoadMin32: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Min, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadMin64: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Min, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadMax32: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Max, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadMax64: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Max, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadUMin32: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMin, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadUMin64: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMin, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadUMax32: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMax, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadUMax64: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMax, false, 64, + NextMBBI); case RISCV::PseudoMaskedAtomicSwap32: return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xchg, true, 32, NextMBBI); @@ -277,6 +335,36 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, switch (BinOp) { default: llvm_unreachable("Unexpected AtomicRMW BinOp"); + case AtomicRMWInst::Xchg: + BuildMI(LoopMBB, DL, TII->get(RISCV::ADDI), ScratchReg) + .addReg(IncrReg) + .addImm(0); + break; + case AtomicRMWInst::Add: + BuildMI(LoopMBB, DL, TII->get(RISCV::ADD), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::Sub: + BuildMI(LoopMBB, DL, TII->get(RISCV::SUB), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::And: + BuildMI(LoopMBB, DL, TII->get(RISCV::AND), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::Or: + BuildMI(LoopMBB, DL, TII->get(RISCV::OR), 
ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::Xor: + BuildMI(LoopMBB, DL, TII->get(RISCV::XOR), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; case AtomicRMWInst::Nand: BuildMI(LoopMBB, DL, TII->get(RISCV::AND), ScratchReg) .addReg(DestReg) @@ -433,38 +521,85 @@ static void insertSext(const RISCVInstrInfo *TII, DebugLoc DL, .addReg(ShamtReg); } -bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width, - MachineBasicBlock::iterator &NextMBBI) { - assert(IsMasked == true && - "Should only need to expand masked atomic max/min"); - assert(Width == 32 && "Should never need to expand masked 64-bit operations"); +static void doAtomicMinMaxOpExpansion( + const RISCVInstrInfo *TII, MachineInstr &MI, DebugLoc DL, + MachineBasicBlock *ThisMBB, MachineBasicBlock *LoopHeadMBB, + MachineBasicBlock *LoopIfBodyMBB, MachineBasicBlock *LoopTailMBB, + MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width, + const RISCVSubtarget *STI) { + Register DestReg = MI.getOperand(0).getReg(); + Register ScratchReg = MI.getOperand(1).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register IncrReg = MI.getOperand(3).getReg(); + AtomicOrdering Ordering = + static_cast<AtomicOrdering>(MI.getOperand(4).getImm()); - MachineInstr &MI = *MBBI; - DebugLoc DL = MI.getDebugLoc(); - MachineFunction *MF = MBB.getParent(); - auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); - auto LoopIfBodyMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); - auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); - auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + // .loophead: + // lr.[w|d] dest, (addr) + // mv scratch, dest + // ifnochangeneeded scratch, incr, .looptail + BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW(Ordering, Width, STI)), DestReg) + .addReg(AddrReg); + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::ADDI), ScratchReg) + .addReg(DestReg) + .addImm(0); + switch (BinOp) { + default: + llvm_unreachable("Unexpected AtomicRMW BinOp"); + case AtomicRMWInst::Max: { + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGE)) + .addReg(ScratchReg) + .addReg(IncrReg) + .addMBB(LoopTailMBB); + break; + } + case AtomicRMWInst::Min: { + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGE)) + .addReg(IncrReg) + .addReg(ScratchReg) + .addMBB(LoopTailMBB); + break; + } + case AtomicRMWInst::UMax: + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGEU)) + .addReg(ScratchReg) + .addReg(IncrReg) + .addMBB(LoopTailMBB); + break; + case AtomicRMWInst::UMin: + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGEU)) + .addReg(IncrReg) + .addReg(ScratchReg) + .addMBB(LoopTailMBB); + break; + } - // Insert new MBBs. - MF->insert(++MBB.getIterator(), LoopHeadMBB); - MF->insert(++LoopHeadMBB->getIterator(), LoopIfBodyMBB); - MF->insert(++LoopIfBodyMBB->getIterator(), LoopTailMBB); - MF->insert(++LoopTailMBB->getIterator(), DoneMBB); + // .loopifbody: + // mv scratch, incr + BuildMI(LoopIfBodyMBB, DL, TII->get(RISCV::ADDI), ScratchReg) + .addReg(IncrReg) + .addImm(0); - // Set up successors and transfer remaining instructions to DoneMBB. 
- LoopHeadMBB->addSuccessor(LoopIfBodyMBB); - LoopHeadMBB->addSuccessor(LoopTailMBB); - LoopIfBodyMBB->addSuccessor(LoopTailMBB); - LoopTailMBB->addSuccessor(LoopHeadMBB); - LoopTailMBB->addSuccessor(DoneMBB); - DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end()); - DoneMBB->transferSuccessors(&MBB); - MBB.addSuccessor(LoopHeadMBB); + // .looptail: + // sc.[w|d] scratch, scratch, (addr) + // bnez scratch, loop + BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), + ScratchReg) + .addReg(ScratchReg) + .addReg(AddrReg); + BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) + .addReg(ScratchReg) + .addReg(RISCV::X0) + .addMBB(LoopHeadMBB); +} +static void doMaskedAtomicMinMaxOpExpansion( + const RISCVInstrInfo *TII, MachineInstr &MI, DebugLoc DL, + MachineBasicBlock *ThisMBB, MachineBasicBlock *LoopHeadMBB, + MachineBasicBlock *LoopIfBodyMBB, MachineBasicBlock *LoopTailMBB, + MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width, + const RISCVSubtarget *STI) { + assert(Width == 32 && "Should never need to expand masked 64-bit operations"); Register DestReg = MI.getOperand(0).getReg(); Register Scratch1Reg = MI.getOperand(1).getReg(); Register Scratch2Reg = MI.getOperand(2).getReg(); @@ -541,6 +676,44 @@ bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( .addReg(Scratch1Reg) .addReg(RISCV::X0) .addMBB(LoopHeadMBB); +} + +bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width, + MachineBasicBlock::iterator &NextMBBI) { + + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + MachineFunction *MF = MBB.getParent(); + auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto LoopIfBodyMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + // Insert new MBBs. + MF->insert(++MBB.getIterator(), LoopHeadMBB); + MF->insert(++LoopHeadMBB->getIterator(), LoopIfBodyMBB); + MF->insert(++LoopIfBodyMBB->getIterator(), LoopTailMBB); + MF->insert(++LoopTailMBB->getIterator(), DoneMBB); + + // Set up successors and transfer remaining instructions to DoneMBB. 
+ LoopHeadMBB->addSuccessor(LoopIfBodyMBB); + LoopHeadMBB->addSuccessor(LoopTailMBB); + LoopIfBodyMBB->addSuccessor(LoopTailMBB); + LoopTailMBB->addSuccessor(LoopHeadMBB); + LoopTailMBB->addSuccessor(DoneMBB); + DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end()); + DoneMBB->transferSuccessors(&MBB); + MBB.addSuccessor(LoopHeadMBB); + + if (!IsMasked) + doAtomicMinMaxOpExpansion(TII, MI, DL, &MBB, LoopHeadMBB, LoopIfBodyMBB, + LoopTailMBB, DoneMBB, BinOp, Width, STI); + else + doMaskedAtomicMinMaxOpExpansion(TII, MI, DL, &MBB, LoopHeadMBB, + LoopIfBodyMBB, LoopTailMBB, DoneMBB, BinOp, + Width, STI); NextMBBI = MBB.end(); MI.eraseFromParent(); diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 19992e6..9e6b7f0 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -218,6 +218,7 @@ def HasStdExtZaamo : Predicate<"Subtarget->hasStdExtZaamo()">, AssemblerPredicate<(any_of FeatureStdExtZaamo), "'Zaamo' (Atomic Memory Operations)">; +def NoStdExtZaamo : Predicate<"!Subtarget->hasStdExtZaamo()">; def FeatureStdExtZalrsc : RISCVExtension<1, 0, "Load-Reserved/Store-Conditional">; @@ -1334,6 +1335,44 @@ def HasVendorXSfvfnrclipxfqf AssemblerPredicate<(all_of FeatureVendorXSfvfnrclipxfqf), "'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)">; +// Note: XSfvfbfexp16e depends on either Zvfbfmin _or_ Zvfbfa, which cannot be expressed here in +// TableGen. Instead, we check that in RISCVISAInfo. +def FeatureVendorXSfvfbfexp16e + : RISCVExtension<0, 5, + "SiFive Vector Floating-Point Exponential Function Instruction, BFloat16">; +def HasVendorXSfvfbfexp16e : Predicate<"Subtarget->hasVendorXSfvfbfexp16e()">; + +def FeatureVendorXSfvfexp16e + : RISCVExtension<0, 5, + "SiFive Vector Floating-Point Exponential Function Instruction, Half Precision", + [FeatureStdExtZvfh]>; +def HasVendorXSfvfexp16e : Predicate<"Subtarget->hasVendorXSfvfexp16e()">; + +def FeatureVendorXSfvfexp32e + : RISCVExtension<0, 5, + "SiFive Vector Floating-Point Exponential Function Instruction, Single Precision", + [FeatureStdExtZve32f]>; +def HasVendorXSfvfexp32e : Predicate<"Subtarget->hasVendorXSfvfexp32e()">; + +def HasVendorXSfvfexpAnyFloat : Predicate<"Subtarget->hasVendorXSfvfexp16e() || Subtarget->hasVendorXSfvfexp32e()">; +def HasVendorXSfvfexpAny : Predicate<"Subtarget->hasVendorXSfvfbfexp16e() || Subtarget->hasVendorXSfvfexp16e() || Subtarget->hasVendorXSfvfexp32e()">, + AssemblerPredicate<(any_of FeatureVendorXSfvfbfexp16e, FeatureVendorXSfvfexp16e, FeatureVendorXSfvfexp32e), + "'Xsfvfbfexp16e', 'Xsfvfexp16e', or 'Xsfvfexp32e' (SiFive Vector Floating-Point Exponential Function Instruction)">; + +def FeatureVendorXSfvfexpa + : RISCVExtension<0, 2, + "SiFive Vector Floating-Point Exponential Approximation Instruction", + [FeatureStdExtZve32f]>; +def HasVendorXSfvfexpa : Predicate<"Subtarget->hasVendorXSfvfexpa()">, + AssemblerPredicate<(all_of FeatureVendorXSfvfexpa), + "'Xsfvfexpa' (SiFive Vector Floating-Point Exponential Approximation Instruction)">; + +def FeatureVendorXSfvfexpa64e + : RISCVExtension<0, 2, + "SiFive Vector Floating-Point Exponential Approximation Instruction with Double-Precision", + [FeatureVendorXSfvfexpa, FeatureStdExtZve64d]>; +def HasVendorXSfvfexpa64e : Predicate<"Subtarget->hasVendorXSfvfexpa64e()">; + def FeatureVendorXSiFivecdiscarddlone : RISCVExtension<1, 0, "SiFive sf.cdiscard.d.l1 Instruction", []>; @@ -1864,7 +1903,7 @@ def FeatureForcedAtomics : SubtargetFeature< "forced-atomics", 
"HasForcedAtomics", "true", "Assume that lock-free native-width atomics are available">; def HasAtomicLdSt - : Predicate<"Subtarget->hasStdExtA() || Subtarget->hasForcedAtomics()">; + : Predicate<"Subtarget->hasStdExtZalrsc() || Subtarget->hasForcedAtomics()">; def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", "AllowTaggedGlobals", diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 169465e..26fe9ed 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -688,7 +688,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, else if (Subtarget.hasStdExtZicbop()) setOperationAction(ISD::PREFETCH, MVT::Other, Legal); - if (Subtarget.hasStdExtA()) { + if (Subtarget.hasStdExtZalrsc()) { setMaxAtomicSizeInBitsSupported(Subtarget.getXLen()); if (Subtarget.hasStdExtZabha() && Subtarget.hasStdExtZacas()) setMinCmpXchgSizeInBits(8); @@ -1558,7 +1558,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } } - if (Subtarget.hasStdExtA()) + if (Subtarget.hasStdExtZaamo()) setOperationAction(ISD::ATOMIC_LOAD_SUB, XLenVT, Expand); if (Subtarget.hasForcedAtomics()) { @@ -12649,10 +12649,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op, Lo = DAG.getNode(ISD::VECTOR_REVERSE, DL, LoVT, Lo); Hi = DAG.getNode(ISD::VECTOR_REVERSE, DL, HiVT, Hi); // Reassemble the low and high pieces reversed. - // FIXME: This is a CONCAT_VECTORS. - SDValue Res = DAG.getInsertSubvector(DL, DAG.getUNDEF(VecVT), Hi, 0); - return DAG.getInsertSubvector(DL, Res, Lo, - LoVT.getVectorMinNumElements()); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Hi, Lo); } // Just promote the int type to i16 which will double the LMUL. @@ -21878,7 +21875,7 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( // result is then sign extended to XLEN. With +A, the minimum width is // 32 for both 64 and 32. assert(getMinCmpXchgSizeInBits() == 32); - assert(Subtarget.hasStdExtA()); + assert(Subtarget.hasStdExtZalrsc()); return Op.getValueSizeInBits() - 31; } break; @@ -24047,18 +24044,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } } - std::pair<Register, const TargetRegisterClass *> Res = - TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); - - // If we picked one of the Zfinx register classes, remap it to the GPR class. - // FIXME: When Zfinx is supported in CodeGen this will need to take the - // Subtarget into account. - if (Res.second == &RISCV::GPRF16RegClass || - Res.second == &RISCV::GPRF32RegClass || - Res.second == &RISCV::GPRPairRegClass) - return std::make_pair(Res.first, &RISCV::GPRRegClass); - - return Res; + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } InlineAsm::ConstraintCode @@ -24485,6 +24471,25 @@ ISD::NodeType RISCVTargetLowering::getExtendForAtomicCmpSwapArg() const { return Subtarget.hasStdExtZacas() ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND; } +ISD::NodeType RISCVTargetLowering::getExtendForAtomicRMWArg(unsigned Op) const { + // Zaamo will use amo<op>.w which does not require extension. + if (Subtarget.hasStdExtZaamo() || Subtarget.hasForcedAtomics()) + return ISD::ANY_EXTEND; + + // Zalrsc pseudo expansions with comparison require sign-extension. 
+ assert(Subtarget.hasStdExtZalrsc()); + switch (Op) { + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: + return ISD::SIGN_EXTEND; + default: + break; + } + return ISD::ANY_EXTEND; +} + Register RISCVTargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { return RISCV::X10; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 3f81ed7..9e3e2a9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -245,6 +245,7 @@ public: } ISD::NodeType getExtendForAtomicCmpSwapArg() const override; + ISD::NodeType getExtendForAtomicRMWArg(unsigned Op) const override; bool shouldTransformSignedTruncationCheck(EVT XVT, unsigned KeptBits) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 12f776b..912b82d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1689,42 +1689,44 @@ bool RISCVInstrInfo::isBranchOffsetInRange(unsigned BranchOp, // instruction opcode. Otherwise, return RISCV::INSTRUCTION_LIST_END. // TODO: Support more operations. unsigned getPredicatedOpcode(unsigned Opcode) { + // clang-format off switch (Opcode) { - case RISCV::ADD: return RISCV::PseudoCCADD; break; - case RISCV::SUB: return RISCV::PseudoCCSUB; break; - case RISCV::SLL: return RISCV::PseudoCCSLL; break; - case RISCV::SRL: return RISCV::PseudoCCSRL; break; - case RISCV::SRA: return RISCV::PseudoCCSRA; break; - case RISCV::AND: return RISCV::PseudoCCAND; break; - case RISCV::OR: return RISCV::PseudoCCOR; break; - case RISCV::XOR: return RISCV::PseudoCCXOR; break; - - case RISCV::ADDI: return RISCV::PseudoCCADDI; break; - case RISCV::SLLI: return RISCV::PseudoCCSLLI; break; - case RISCV::SRLI: return RISCV::PseudoCCSRLI; break; - case RISCV::SRAI: return RISCV::PseudoCCSRAI; break; - case RISCV::ANDI: return RISCV::PseudoCCANDI; break; - case RISCV::ORI: return RISCV::PseudoCCORI; break; - case RISCV::XORI: return RISCV::PseudoCCXORI; break; - - case RISCV::ADDW: return RISCV::PseudoCCADDW; break; - case RISCV::SUBW: return RISCV::PseudoCCSUBW; break; - case RISCV::SLLW: return RISCV::PseudoCCSLLW; break; - case RISCV::SRLW: return RISCV::PseudoCCSRLW; break; - case RISCV::SRAW: return RISCV::PseudoCCSRAW; break; - - case RISCV::ADDIW: return RISCV::PseudoCCADDIW; break; - case RISCV::SLLIW: return RISCV::PseudoCCSLLIW; break; - case RISCV::SRLIW: return RISCV::PseudoCCSRLIW; break; - case RISCV::SRAIW: return RISCV::PseudoCCSRAIW; break; - - case RISCV::ANDN: return RISCV::PseudoCCANDN; break; - case RISCV::ORN: return RISCV::PseudoCCORN; break; - case RISCV::XNOR: return RISCV::PseudoCCXNOR; break; - - case RISCV::NDS_BFOS: return RISCV::PseudoCCNDS_BFOS; break; - case RISCV::NDS_BFOZ: return RISCV::PseudoCCNDS_BFOZ; break; + case RISCV::ADD: return RISCV::PseudoCCADD; + case RISCV::SUB: return RISCV::PseudoCCSUB; + case RISCV::SLL: return RISCV::PseudoCCSLL; + case RISCV::SRL: return RISCV::PseudoCCSRL; + case RISCV::SRA: return RISCV::PseudoCCSRA; + case RISCV::AND: return RISCV::PseudoCCAND; + case RISCV::OR: return RISCV::PseudoCCOR; + case RISCV::XOR: return RISCV::PseudoCCXOR; + + case RISCV::ADDI: return RISCV::PseudoCCADDI; + case RISCV::SLLI: return RISCV::PseudoCCSLLI; + case RISCV::SRLI: return RISCV::PseudoCCSRLI; + case RISCV::SRAI: return RISCV::PseudoCCSRAI; + case RISCV::ANDI: return 
RISCV::PseudoCCANDI; + case RISCV::ORI: return RISCV::PseudoCCORI; + case RISCV::XORI: return RISCV::PseudoCCXORI; + + case RISCV::ADDW: return RISCV::PseudoCCADDW; + case RISCV::SUBW: return RISCV::PseudoCCSUBW; + case RISCV::SLLW: return RISCV::PseudoCCSLLW; + case RISCV::SRLW: return RISCV::PseudoCCSRLW; + case RISCV::SRAW: return RISCV::PseudoCCSRAW; + + case RISCV::ADDIW: return RISCV::PseudoCCADDIW; + case RISCV::SLLIW: return RISCV::PseudoCCSLLIW; + case RISCV::SRLIW: return RISCV::PseudoCCSRLIW; + case RISCV::SRAIW: return RISCV::PseudoCCSRAIW; + + case RISCV::ANDN: return RISCV::PseudoCCANDN; + case RISCV::ORN: return RISCV::PseudoCCORN; + case RISCV::XNOR: return RISCV::PseudoCCXNOR; + + case RISCV::NDS_BFOS: return RISCV::PseudoCCNDS_BFOS; + case RISCV::NDS_BFOZ: return RISCV::PseudoCCNDS_BFOZ; } + // clang-format on return RISCV::INSTRUCTION_LIST_END; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index 571d72f..5c81a09 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -158,9 +158,9 @@ class seq_cst_store<PatFrag base> } } // IsAtomic = 1 -// Atomic load/store are available under both +a and +force-atomics. -// Fences will be inserted for atomic load/stores according to the logic in -// RISCVTargetLowering::{emitLeadingFence,emitTrailingFence}. +// Atomic load/store are available under +zalrsc (thus also +a) and +// +force-atomics. Fences will be inserted for atomic load/stores according to +// the logic in RISCVTargetLowering::{emitLeadingFence,emitTrailingFence}. // The normal loads/stores are relaxed (unordered) loads/stores that don't have // any ordering. This is necessary because AtomicExpandPass has added fences to // atomic load/stores and changed them to unordered ones. 
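The three-block loop emitted above for the unmasked min/max expansion (.loophead compare, .loopifbody move, .looptail store-conditional) has a direct C++ analog: an LR/SC pair behaves like a compare-exchange retry loop. A minimal sketch for AtomicRMWInst::Max, assuming nothing beyond the standard library (illustrative only, not the pass's API):

#include <atomic>

// .loophead:   lr.[w|d] dest, (addr)            -> initial load / CAS reload
// .loopifbody: mv scratch, incr                 -> pick the larger value
// .looptail:   sc.[w|d] scratch, scratch, (addr) -> compare_exchange_weak
//              bnez scratch, .loophead          -> retry on failure
int atomicMax(std::atomic<int> &Addr, int Incr) {
  int Dest = Addr.load(std::memory_order_relaxed);
  for (;;) {
    int Scratch = Dest;               // mv scratch, dest
    if (!(Scratch >= Incr))           // bge scratch, incr, .looptail
      Scratch = Incr;                 // .loopifbody taken
    if (Addr.compare_exchange_weak(Dest, Scratch))
      return Dest;                    // dest still holds the old value
  }
}

Because .loophead compares full-width registers (bge/bgeu), i32 min/max operands must be sign-extended on RV64 when only Zalrsc is available; that is what the getExtendForAtomicRMWArg hook above arranges, while Zaamo's amo<op>.w needs no extension.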
@@ -308,7 +308,65 @@ class PseudoMaskedAMOMinMaxPat<Intrinsic intrin, Pseudo AMOInst> (AMOInst GPR:$addr, GPR:$incr, GPR:$mask, GPR:$shiftamt, timm:$ordering)>; -let Predicates = [HasStdExtA] in { +let Predicates = [HasStdExtZalrsc, NoStdExtZaamo] in { + +let Size = 16 in { +def PseudoAtomicSwap32 : PseudoAMO; +def PseudoAtomicLoadAdd32 : PseudoAMO; +def PseudoAtomicLoadSub32 : PseudoAMO; +def PseudoAtomicLoadAnd32 : PseudoAMO; +def PseudoAtomicLoadOr32 : PseudoAMO; +def PseudoAtomicLoadXor32 : PseudoAMO; +} // Size = 16 +let Size = 24 in { +def PseudoAtomicLoadMax32 : PseudoAMO; +def PseudoAtomicLoadMin32 : PseudoAMO; +def PseudoAtomicLoadUMax32 : PseudoAMO; +def PseudoAtomicLoadUMin32 : PseudoAMO; +} // Size = 24 + +defm : PseudoAMOPat<"atomic_swap_i32", PseudoAtomicSwap32>; +defm : PseudoAMOPat<"atomic_load_add_i32", PseudoAtomicLoadAdd32>; +defm : PseudoAMOPat<"atomic_load_sub_i32", PseudoAtomicLoadSub32>; +defm : PseudoAMOPat<"atomic_load_and_i32", PseudoAtomicLoadAnd32>; +defm : PseudoAMOPat<"atomic_load_or_i32", PseudoAtomicLoadOr32>; +defm : PseudoAMOPat<"atomic_load_xor_i32", PseudoAtomicLoadXor32>; +defm : PseudoAMOPat<"atomic_load_max_i32", PseudoAtomicLoadMax32>; +defm : PseudoAMOPat<"atomic_load_min_i32", PseudoAtomicLoadMin32>; +defm : PseudoAMOPat<"atomic_load_umax_i32", PseudoAtomicLoadUMax32>; +defm : PseudoAMOPat<"atomic_load_umin_i32", PseudoAtomicLoadUMin32>; +} // Predicates = [HasStdExtZalrsc, NoStdExtZaamo] + +let Predicates = [HasStdExtZalrsc, NoStdExtZaamo, IsRV64] in { + +let Size = 16 in { +def PseudoAtomicSwap64 : PseudoAMO; +def PseudoAtomicLoadAdd64 : PseudoAMO; +def PseudoAtomicLoadSub64 : PseudoAMO; +def PseudoAtomicLoadAnd64 : PseudoAMO; +def PseudoAtomicLoadOr64 : PseudoAMO; +def PseudoAtomicLoadXor64 : PseudoAMO; +} // Size = 16 +let Size = 24 in { +def PseudoAtomicLoadMax64 : PseudoAMO; +def PseudoAtomicLoadMin64 : PseudoAMO; +def PseudoAtomicLoadUMax64 : PseudoAMO; +def PseudoAtomicLoadUMin64 : PseudoAMO; +} // Size = 24 + +defm : PseudoAMOPat<"atomic_swap_i64", PseudoAtomicSwap64, i64>; +defm : PseudoAMOPat<"atomic_load_add_i64", PseudoAtomicLoadAdd64, i64>; +defm : PseudoAMOPat<"atomic_load_sub_i64", PseudoAtomicLoadSub64, i64>; +defm : PseudoAMOPat<"atomic_load_and_i64", PseudoAtomicLoadAnd64, i64>; +defm : PseudoAMOPat<"atomic_load_or_i64", PseudoAtomicLoadOr64, i64>; +defm : PseudoAMOPat<"atomic_load_xor_i64", PseudoAtomicLoadXor64, i64>; +defm : PseudoAMOPat<"atomic_load_max_i64", PseudoAtomicLoadMax64, i64>; +defm : PseudoAMOPat<"atomic_load_min_i64", PseudoAtomicLoadMin64, i64>; +defm : PseudoAMOPat<"atomic_load_umax_i64", PseudoAtomicLoadUMax64, i64>; +defm : PseudoAMOPat<"atomic_load_umin_i64", PseudoAtomicLoadUMin64, i64>; +} // Predicates = [HasStdExtZalrsc, NoStdExtZaamo, IsRV64] + +let Predicates = [HasStdExtZalrsc] in { let Size = 20 in def PseudoAtomicLoadNand32 : PseudoAMO; @@ -347,14 +405,14 @@ def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umax, PseudoMaskedAtomicLoadUMax32>; def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umin, PseudoMaskedAtomicLoadUMin32>; -} // Predicates = [HasStdExtA] +} // Predicates = [HasStdExtZalrsc] -let Predicates = [HasStdExtA, IsRV64] in { +let Predicates = [HasStdExtZalrsc, IsRV64] in { let Size = 20 in def PseudoAtomicLoadNand64 : PseudoAMO; defm : PseudoAMOPat<"atomic_load_nand_i64", PseudoAtomicLoadNand64, i64>; -} // Predicates = [HasStdExtA, IsRV64] +} // Predicates = [HasStdExtZalrsc, IsRV64] /// Compare and exchange @@ -385,17 +443,17 @@ multiclass PseudoCmpXchgPat<string Op, Pseudo 
CmpXchgInst, (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 7)>; } -let Predicates = [HasStdExtA, NoStdExtZacas] in { +let Predicates = [HasStdExtZalrsc, NoStdExtZacas] in { def PseudoCmpXchg32 : PseudoCmpXchg; defm : PseudoCmpXchgPat<"atomic_cmp_swap_i32", PseudoCmpXchg32>; } -let Predicates = [HasStdExtA, NoStdExtZacas, IsRV64] in { +let Predicates = [HasStdExtZalrsc, NoStdExtZacas, IsRV64] in { def PseudoCmpXchg64 : PseudoCmpXchg; defm : PseudoCmpXchgPat<"atomic_cmp_swap_i64", PseudoCmpXchg64, i64>; } -let Predicates = [HasStdExtA] in { +let Predicates = [HasStdExtZalrsc] in { def PseudoMaskedCmpXchg32 : Pseudo<(outs GPR:$res, GPR:$scratch), (ins GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, @@ -412,4 +470,4 @@ def : Pat<(XLenVT (int_riscv_masked_cmpxchg (XLenVT GPR:$mask), (XLenVT timm:$ordering))), (PseudoMaskedCmpXchg32 GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering)>; -} // Predicates = [HasStdExtA] +} // Predicates = [HasStdExtZalrsc] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 6a4119a..4104abd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -217,6 +217,14 @@ let Predicates = [HasVendorXSfvcp], mayLoad = 0, mayStore = 0, defm FVW : CustomSiFiveVCIX<"fvw", VCIX_XVW, VR, VR, FPR32>, Sched<[]>; } +let Predicates = [HasVendorXSfvfexpAny], DecoderNamespace = "XSfvector" in { + def SF_VFEXP_V : VALUVs2<0b010011, 0b00111, OPFVV, "sf.vfexp.v">; +} + +let Predicates = [HasVendorXSfvfexpa], DecoderNamespace = "XSfvector" in { + def SF_VFEXPA_V : VALUVs2<0b010011, 0b00110, OPFVV, "sf.vfexpa.v">; +} + let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvector", DestEEW = EEWSEWx4, RVVConstraint=VS2Constraint in { def SF_VQMACCU_2x8x2 : CustomSiFiveVMACC<0b101100, OPMVV, "sf.vqmaccu.2x8x2">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 40d7341..62b7bcd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -527,8 +527,8 @@ def : Pat<(riscv_rolw GPR:$rs1, uimm5:$rs2), let Predicates = [HasStdExtZbs] in { def : Pat<(XLenVT (and (not (shl 1, shiftMaskXLen:$rs2)), GPR:$rs1)), (BCLR GPR:$rs1, shiftMaskXLen:$rs2)>; -def : Pat<(XLenVT (and (rotl -2, (XLenVT GPR:$rs2)), GPR:$rs1)), - (BCLR GPR:$rs1, GPR:$rs2)>; +def : Pat<(XLenVT (and (rotl -2, shiftMaskXLen:$rs2), GPR:$rs1)), + (BCLR GPR:$rs1, shiftMaskXLen:$rs2)>; def : Pat<(XLenVT (or (shl 1, shiftMaskXLen:$rs2), GPR:$rs1)), (BSET GPR:$rs1, shiftMaskXLen:$rs2)>; def : Pat<(XLenVT (xor (shl 1, shiftMaskXLen:$rs2), GPR:$rs1)), diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 5591d9f..021353a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -355,9 +355,9 @@ private: SPIRVType *widenTypeToVec4(const SPIRVType *Type, MachineInstr &I) const; bool extractSubvector(Register &ResVReg, const SPIRVType *ResType, Register &ReadReg, MachineInstr &InsertionPoint) const; - bool generateImageRead(Register &ResVReg, const SPIRVType *ResType, - Register ImageReg, Register IdxReg, DebugLoc Loc, - MachineInstr &Pos) const; + bool generateImageReadOrFetch(Register &ResVReg, const SPIRVType *ResType, + Register ImageReg, Register IdxReg, + DebugLoc Loc, MachineInstr &Pos) const; bool BuildCOPY(Register DestReg, Register SrcReg, MachineInstr &I) const; bool 
loadVec3BuiltinInputID(SPIRV::BuiltIn::BuiltIn BuiltInValue, Register ResVReg, const SPIRVType *ResType, @@ -1321,8 +1321,8 @@ bool SPIRVInstructionSelector::selectLoad(Register ResVReg, } Register IdxReg = IntPtrDef->getOperand(3).getReg(); - return generateImageRead(ResVReg, ResType, NewHandleReg, IdxReg, - I.getDebugLoc(), I); + return generateImageReadOrFetch(ResVReg, ResType, NewHandleReg, IdxReg, + I.getDebugLoc(), I); } } @@ -3639,27 +3639,33 @@ bool SPIRVInstructionSelector::selectReadImageIntrinsic( DebugLoc Loc = I.getDebugLoc(); MachineInstr &Pos = I; - return generateImageRead(ResVReg, ResType, NewImageReg, IdxReg, Loc, Pos); + return generateImageReadOrFetch(ResVReg, ResType, NewImageReg, IdxReg, Loc, + Pos); } -bool SPIRVInstructionSelector::generateImageRead(Register &ResVReg, - const SPIRVType *ResType, - Register ImageReg, - Register IdxReg, DebugLoc Loc, - MachineInstr &Pos) const { +bool SPIRVInstructionSelector::generateImageReadOrFetch( + Register &ResVReg, const SPIRVType *ResType, Register ImageReg, + Register IdxReg, DebugLoc Loc, MachineInstr &Pos) const { SPIRVType *ImageType = GR.getSPIRVTypeForVReg(ImageReg); assert(ImageType && ImageType->getOpcode() == SPIRV::OpTypeImage && "ImageReg is not an image type."); + bool IsSignedInteger = sampledTypeIsSignedInteger(GR.getTypeForSPIRVType(ImageType)); + // Check if the "sampled" operand of the image type is 1. + // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpImageFetch + auto SampledOp = ImageType->getOperand(6); + bool IsFetch = (SampledOp.getImm() == 1); uint64_t ResultSize = GR.getScalarOrVectorComponentCount(ResType); if (ResultSize == 4) { - auto BMI = BuildMI(*Pos.getParent(), Pos, Loc, TII.get(SPIRV::OpImageRead)) - .addDef(ResVReg) - .addUse(GR.getSPIRVTypeID(ResType)) - .addUse(ImageReg) - .addUse(IdxReg); + auto BMI = + BuildMI(*Pos.getParent(), Pos, Loc, + TII.get(IsFetch ? SPIRV::OpImageFetch : SPIRV::OpImageRead)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(ImageReg) + .addUse(IdxReg); if (IsSignedInteger) BMI.addImm(0x1000); // SignExtend @@ -3668,11 +3674,13 @@ bool SPIRVInstructionSelector::generateImageRead(Register &ResVReg, SPIRVType *ReadType = widenTypeToVec4(ResType, Pos); Register ReadReg = MRI->createVirtualRegister(GR.getRegClass(ReadType)); - auto BMI = BuildMI(*Pos.getParent(), Pos, Loc, TII.get(SPIRV::OpImageRead)) - .addDef(ReadReg) - .addUse(GR.getSPIRVTypeID(ReadType)) - .addUse(ImageReg) - .addUse(IdxReg); + auto BMI = + BuildMI(*Pos.getParent(), Pos, Loc, + TII.get(IsFetch ? 
SPIRV::OpImageFetch : SPIRV::OpImageRead)) + .addDef(ReadReg) + .addUse(GR.getSPIRVTypeID(ReadType)) + .addUse(ImageReg) + .addUse(IdxReg); if (IsSignedInteger) BMI.addImm(0x1000); // SignExtend bool Succeed = BMI.constrainAllUses(TII, TRI, RBI); diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index cf85691..9bda8a4 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -158,7 +158,6 @@ void TargetMachine::resetTargetOptions(const Function &F) const { Options.X = F.getFnAttribute(Y).getValueAsBool(); \ } while (0) - RESET_OPTION(UnsafeFPMath, "unsafe-fp-math"); RESET_OPTION(NoInfsFPMath, "no-infs-fp-math"); RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math"); RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math"); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b5f8ee5..b54a1e7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -29516,11 +29516,8 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX)) return MinMax; - if (DAG.isKnownNeverNaN(NewX)) - NewX = NewY; - - SDValue IsNaN = - DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO); + SDValue NaNSrc = IsNum ? MinMax : NewX; + SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NaNSrc, NaNSrc, ISD::SETUO); return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax); } diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp index 7882045..0fce5b9 100644 --- a/llvm/lib/TargetParser/ARMTargetParser.cpp +++ b/llvm/lib/TargetParser/ARMTargetParser.cpp @@ -567,8 +567,8 @@ StringRef ARM::computeDefaultTargetABI(const Triple &TT) { default: if (TT.isOSNetBSD()) return "apcs-gnu"; - if (TT.isOSFreeBSD() || TT.isOSOpenBSD() || TT.isOSHaiku() || - TT.isOHOSFamily()) + if (TT.isOSFreeBSD() || TT.isOSFuchsia() || TT.isOSOpenBSD() || + TT.isOSHaiku() || TT.isOHOSFamily()) return "aapcs-linux"; return "aapcs"; } @@ -648,6 +648,8 @@ StringRef ARM::getARMCPUForArch(const llvm::Triple &Triple, StringRef MArch) { } case llvm::Triple::OpenBSD: return "cortex-a8"; + case llvm::Triple::Fuchsia: + return "cortex-a53"; default: switch (Triple.getEnvironment()) { case llvm::Triple::EABIHF: diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp index 31126cc..f08a0c0 100644 --- a/llvm/lib/TargetParser/RISCVISAInfo.cpp +++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp @@ -765,6 +765,12 @@ Error RISCVISAInfo::checkDependency() { if (HasZvl && !HasVector) return getExtensionRequiresError("zvl*b", "v' or 'zve*"); + if (Exts.count("xsfvfbfexp16e") && + !(Exts.count("zvfbfmin") || Exts.count("zvfbfa"))) + return createStringError(errc::invalid_argument, + "'xsfvfbfexp16e' requires 'zvfbfmin' or " + "'zvfbfa' extension to also be specified"); + if (HasD && (HasC || Exts.count("zcd"))) for (auto Ext : ZcdOverlaps) if (Exts.count(Ext.str())) diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 28ee444..a29faab 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -1368,13 +1368,13 @@ static void ComputeCrossModuleImportForModuleFromIndexForTest( FunctionImporter::ImportMapTy &ImportList) { for (const auto &GlobalList : Index) { // Ignore entries for undefined references. 
- if (GlobalList.second.SummaryList.empty()) + if (GlobalList.second.getSummaryList().empty()) continue; auto GUID = GlobalList.first; - assert(GlobalList.second.SummaryList.size() == 1 && + assert(GlobalList.second.getSummaryList().size() == 1 && "Expected individual combined index to have one summary per GUID"); - auto &Summary = GlobalList.second.SummaryList[0]; + auto &Summary = GlobalList.second.getSummaryList()[0]; // Skip the summaries for the importing module. These are included to // e.g. record required linkage changes. if (Summary->modulePath() == ModulePath) @@ -1423,7 +1423,7 @@ void updateValueInfoForIndirectCalls(ModuleSummaryIndex &Index, void llvm::updateIndirectCalls(ModuleSummaryIndex &Index) { for (const auto &Entry : Index) { - for (const auto &S : Entry.second.SummaryList) { + for (const auto &S : Entry.second.getSummaryList()) { if (auto *FS = dyn_cast<FunctionSummary>(S.get())) updateValueInfoForIndirectCalls(Index, FS); } @@ -1456,7 +1456,7 @@ void llvm::computeDeadSymbolsAndUpdateIndirectCalls( // Add values flagged in the index as live roots to the worklist. for (const auto &Entry : Index) { auto VI = Index.getValueInfo(Entry); - for (const auto &S : Entry.second.SummaryList) { + for (const auto &S : Entry.second.getSummaryList()) { if (auto *FS = dyn_cast<FunctionSummary>(S.get())) updateValueInfoForIndirectCalls(Index, FS); if (S->isLive()) { @@ -2094,7 +2094,7 @@ static bool doImportingForModuleForTest( // is only enabled when testing importing via the 'opt' tool, which does // not do the ThinLink that would normally determine what values to promote. for (auto &I : *Index) { - for (auto &S : I.second.SummaryList) { + for (auto &S : I.second.getSummaryList()) { if (GlobalValue::isLocalLinkage(S->linkage())) S->setLinkage(GlobalValue::ExternalLinkage); } diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index be6cba3..aa1346d 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1271,7 +1271,7 @@ bool LowerTypeTestsModule::hasBranchTargetEnforcement() { // the module flags. if (const auto *BTE = mdconst::extract_or_null<ConstantInt>( M.getModuleFlag("branch-target-enforcement"))) - HasBranchTargetEnforcement = (BTE->getZExtValue() != 0); + HasBranchTargetEnforcement = !BTE->isZero(); else HasBranchTargetEnforcement = 0; } @@ -2130,7 +2130,7 @@ bool LowerTypeTestsModule::lower() { // A set of all functions that are address taken by a live global object. DenseSet<GlobalValue::GUID> AddressTaken; for (auto &I : *ExportSummary) - for (auto &GVS : I.second.SummaryList) + for (auto &GVS : I.second.getSummaryList()) if (GVS->isLive()) for (const auto &Ref : GVS->refs()) { AddressTaken.insert(Ref.getGUID()); @@ -2409,7 +2409,7 @@ bool LowerTypeTestsModule::lower() { } for (auto &P : *ExportSummary) { - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { if (!ExportSummary->isGlobalValueLive(S.get())) continue; if (auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject())) diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 2d5cb82..76e588b 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -928,7 +928,7 @@ void llvm::updateVCallVisibilityInIndex( // linker, as we have no information on their eventual use. 
if (DynamicExportSymbols.count(P.first)) continue; - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { auto *GVar = dyn_cast<GlobalVarSummary>(S.get()); if (!GVar || GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic) @@ -2413,7 +2413,7 @@ bool DevirtModule::run() { } for (auto &P : *ExportSummary) { - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { auto *FS = dyn_cast<FunctionSummary>(S.get()); if (!FS) continue; @@ -2564,7 +2564,7 @@ void DevirtIndex::run() { // Collect information from summary about which calls to try to devirtualize. for (auto &P : ExportSummary) { - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { auto *FS = dyn_cast<FunctionSummary>(S.get()); if (!FS) continue; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index cdc559b..9b9fe26 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1643,33 +1643,46 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) { /// Return a Constant* for the specified floating-point constant if it fits /// in the specified FP type without changing its value. -static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { +static bool fitsInFPType(APFloat F, const fltSemantics &Sem) { bool losesInfo; - APFloat F = CFP->getValueAPF(); (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo); return !losesInfo; } -static Type *shrinkFPConstant(ConstantFP *CFP, bool PreferBFloat) { - if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext())) - return nullptr; // No constant folding of this. +static Type *shrinkFPConstant(LLVMContext &Ctx, const APFloat &F, + bool PreferBFloat) { // See if the value can be truncated to bfloat and then reextended. - if (PreferBFloat && fitsInFPType(CFP, APFloat::BFloat())) - return Type::getBFloatTy(CFP->getContext()); + if (PreferBFloat && fitsInFPType(F, APFloat::BFloat())) + return Type::getBFloatTy(Ctx); // See if the value can be truncated to half and then reextended. - if (!PreferBFloat && fitsInFPType(CFP, APFloat::IEEEhalf())) - return Type::getHalfTy(CFP->getContext()); + if (!PreferBFloat && fitsInFPType(F, APFloat::IEEEhalf())) + return Type::getHalfTy(Ctx); // See if the value can be truncated to float and then reextended. - if (fitsInFPType(CFP, APFloat::IEEEsingle())) - return Type::getFloatTy(CFP->getContext()); - if (CFP->getType()->isDoubleTy()) - return nullptr; // Won't shrink. - if (fitsInFPType(CFP, APFloat::IEEEdouble())) - return Type::getDoubleTy(CFP->getContext()); + if (fitsInFPType(F, APFloat::IEEEsingle())) + return Type::getFloatTy(Ctx); + if (&F.getSemantics() == &APFloat::IEEEdouble()) + return nullptr; // Won't shrink. + // See if the value can be truncated to double and then reextended. + if (fitsInFPType(F, APFloat::IEEEdouble())) + return Type::getDoubleTy(Ctx); // Don't try to shrink to various long double types. return nullptr; } +static Type *shrinkFPConstant(ConstantFP *CFP, bool PreferBFloat) { + Type *Ty = CFP->getType(); + if (Ty->getScalarType()->isPPC_FP128Ty()) + return nullptr; // No constant folding of this. 
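fitsInFPType now takes the APFloat by value, but the test is the same convert-and-round-trip check. The double-to-float case in plain C++ (a sketch; NaN payload subtleties aside, APFloat reports the same answer through losesInfo):

// A constant can shrink when converting down and back preserves its value.
bool fitsInFloat(double D) {
  float F = static_cast<float>(D);     // convert(IEEEsingle, rmNearestTiesToEven)
  return static_cast<double>(F) == D;  // round trip intact <=> !losesInfo
}
// fitsInFloat(0.5) == true; fitsInFloat(0.1) == false, since 0.1 has no
// exact binary32 representation and the round trip perturbs it.

The wrapper that continues below then re-wraps the shrunken scalar type in the original vector shape for splat constants.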
+ + Type *ShrinkTy = + shrinkFPConstant(CFP->getContext(), CFP->getValueAPF(), PreferBFloat); + if (ShrinkTy) + if (auto *VecTy = dyn_cast<VectorType>(Ty)) + ShrinkTy = VectorType::get(ShrinkTy, VecTy); + + return ShrinkTy; +} + // Determine if this is a vector of ConstantFPs and if so, return the minimal // type we can safely truncate all elements to. static Type *shrinkFPConstantVector(Value *V, bool PreferBFloat) { @@ -1720,10 +1733,10 @@ static Type *getMinimumFPType(Value *V, bool PreferBFloat) { // Try to shrink scalable and fixed splat vectors. if (auto *FPC = dyn_cast<Constant>(V)) - if (isa<VectorType>(V->getType())) + if (auto *VTy = dyn_cast<VectorType>(V->getType())) if (auto *Splat = dyn_cast_or_null<ConstantFP>(FPC->getSplatValue())) if (Type *T = shrinkFPConstant(Splat, PreferBFloat)) - return T; + return VectorType::get(T, VTy); // Try to shrink a vector of FP constants. This returns nullptr on scalable // vectors @@ -1796,10 +1809,9 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) { Type *Ty = FPT.getType(); auto *BO = dyn_cast<BinaryOperator>(FPT.getOperand(0)); if (BO && BO->hasOneUse()) { - Type *LHSMinType = - getMinimumFPType(BO->getOperand(0), /*PreferBFloat=*/Ty->isBFloatTy()); - Type *RHSMinType = - getMinimumFPType(BO->getOperand(1), /*PreferBFloat=*/Ty->isBFloatTy()); + bool PreferBFloat = Ty->getScalarType()->isBFloatTy(); + Type *LHSMinType = getMinimumFPType(BO->getOperand(0), PreferBFloat); + Type *RHSMinType = getMinimumFPType(BO->getOperand(1), PreferBFloat); unsigned OpWidth = BO->getType()->getFPMantissaWidth(); unsigned LHSWidth = LHSMinType->getFPMantissaWidth(); unsigned RHSWidth = RHSMinType->getFPMantissaWidth(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 975498f..5aa8de3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3455,27 +3455,45 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { // select a, false, b -> select !a, b, false if (match(TrueVal, m_Specific(Zero))) { Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName()); - return SelectInst::Create(NotCond, FalseVal, Zero); + Instruction *MDFrom = ProfcheckDisableMetadataFixes ? nullptr : &SI; + SelectInst *NewSI = + SelectInst::Create(NotCond, FalseVal, Zero, "", nullptr, MDFrom); + NewSI->swapProfMetadata(); + return NewSI; } // select a, b, true -> select !a, true, b if (match(FalseVal, m_Specific(One))) { Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName()); - return SelectInst::Create(NotCond, One, TrueVal); + Instruction *MDFrom = ProfcheckDisableMetadataFixes ? nullptr : &SI; + SelectInst *NewSI = + SelectInst::Create(NotCond, One, TrueVal, "", nullptr, MDFrom); + NewSI->swapProfMetadata(); + return NewSI; } // DeMorgan in select form: !a && !b --> !(a || b) // select !a, !b, false --> not (select a, true, b) if (match(&SI, m_LogicalAnd(m_Not(m_Value(A)), m_Not(m_Value(B)))) && (CondVal->hasOneUse() || TrueVal->hasOneUse()) && - !match(A, m_ConstantExpr()) && !match(B, m_ConstantExpr())) - return BinaryOperator::CreateNot(Builder.CreateSelect(A, One, B)); + !match(A, m_ConstantExpr()) && !match(B, m_ConstantExpr())) { + Instruction *MDFrom = ProfcheckDisableMetadataFixes ? 
nullptr : &SI; + SelectInst *NewSI = + cast<SelectInst>(Builder.CreateSelect(A, One, B, "", MDFrom)); + NewSI->swapProfMetadata(); + return BinaryOperator::CreateNot(NewSI); + } // DeMorgan in select form: !a || !b --> !(a && b) // select !a, true, !b --> not (select a, b, false) if (match(&SI, m_LogicalOr(m_Not(m_Value(A)), m_Not(m_Value(B)))) && (CondVal->hasOneUse() || FalseVal->hasOneUse()) && - !match(A, m_ConstantExpr()) && !match(B, m_ConstantExpr())) - return BinaryOperator::CreateNot(Builder.CreateSelect(A, B, Zero)); + !match(A, m_ConstantExpr()) && !match(B, m_ConstantExpr())) { + Instruction *MDFrom = ProfcheckDisableMetadataFixes ? nullptr : &SI; + SelectInst *NewSI = + cast<SelectInst>(Builder.CreateSelect(A, B, Zero, "", MDFrom)); + NewSI->swapProfMetadata(); + return BinaryOperator::CreateNot(NewSI); + } // select (select a, true, b), true, b -> select a, true, b if (match(CondVal, m_Select(m_Value(A), m_One(), m_Value(B))) && diff --git a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp index 40720ae..0873845 100644 --- a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp +++ b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp @@ -31,10 +31,12 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" +#include "llvm/Support/AllocToken.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" @@ -53,29 +55,12 @@ #include <variant> using namespace llvm; +using TokenMode = AllocTokenMode; #define DEBUG_TYPE "alloc-token" namespace { -//===--- Constants --------------------------------------------------------===// - -enum class TokenMode : unsigned { - /// Incrementally increasing token ID. - Increment = 0, - - /// Simple mode that returns a statically-assigned random token ID. - Random = 1, - - /// Token ID based on allocated type hash. - TypeHash = 2, - - /// Token ID based on allocated type hash, where the top half ID-space is - /// reserved for types that contain pointers and the bottom half for types - /// that do not contain pointers. - TypeHashPointerSplit = 3, -}; - //===--- Command-line options ---------------------------------------------===// cl::opt<TokenMode> ClMode( @@ -131,7 +116,7 @@ cl::opt<uint64_t> ClFallbackToken( //===--- Statistics -------------------------------------------------------===// -STATISTIC(NumFunctionsInstrumented, "Functions instrumented"); +STATISTIC(NumFunctionsModified, "Functions modified"); STATISTIC(NumAllocationsInstrumented, "Allocations instrumented"); //===----------------------------------------------------------------------===// @@ -140,9 +125,19 @@ STATISTIC(NumAllocationsInstrumented, "Allocations instrumented"); /// /// Expected format is: !{<type-name>, <contains-pointer>} MDNode *getAllocTokenMetadata(const CallBase &CB) { - MDNode *Ret = CB.getMetadata(LLVMContext::MD_alloc_token); - if (!Ret) - return nullptr; + MDNode *Ret = nullptr; + if (auto *II = dyn_cast<IntrinsicInst>(&CB); + II && II->getIntrinsicID() == Intrinsic::alloc_token_id) { + auto *MDV = cast<MetadataAsValue>(II->getArgOperand(0)); + Ret = cast<MDNode>(MDV->getMetadata()); + // If the intrinsic has an empty MDNode, type inference failed. 
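getAllocTokenMetadata now accepts two sources of type information: the metadata argument of the alloc_token intrinsic and the !alloc_token attachment on ordinary allocation calls, both reducing to a node of the form !{!"TypeName", i1 ContainsPointer}. The intrinsic path, condensed into a standalone sketch (mirrors the logic above; not a separate API):

#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// An empty node on the intrinsic means type inference failed upstream, so
// callers fall back to the fallback token.
static MDNode *tokenNodeFromIntrinsic(const IntrinsicInst &II) {
  auto *MDV = cast<MetadataAsValue>(II.getArgOperand(0));
  auto *N = cast<MDNode>(MDV->getMetadata());
  return N->getNumOperands() == 0 ? nullptr : N;
}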
+ if (Ret->getNumOperands() == 0) + return nullptr; + } else { + Ret = CB.getMetadata(LLVMContext::MD_alloc_token); + if (!Ret) + return nullptr; + } assert(Ret->getNumOperands() == 2 && "bad !alloc_token"); assert(isa<MDString>(Ret->getOperand(0))); assert(isa<ConstantAsMetadata>(Ret->getOperand(1))); @@ -206,22 +201,19 @@ public: using ModeBase::ModeBase; uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &ORE) { - const auto [N, H] = getHash(CB, ORE); - return N ? boundedToken(H) : H; - } -protected: - std::pair<MDNode *, uint64_t> getHash(const CallBase &CB, - OptimizationRemarkEmitter &ORE) { if (MDNode *N = getAllocTokenMetadata(CB)) { MDString *S = cast<MDString>(N->getOperand(0)); - return {N, getStableSipHash(S->getString())}; + AllocTokenMetadata Metadata{S->getString(), containsPointer(N)}; + if (auto Token = getAllocToken(TokenMode::TypeHash, Metadata, MaxTokens)) + return *Token; } // Fallback. remarkNoMetadata(CB, ORE); - return {nullptr, ClFallbackToken}; + return ClFallbackToken; } +protected: /// Remark that there was no precise type information. static void remarkNoMetadata(const CallBase &CB, OptimizationRemarkEmitter &ORE) { @@ -242,20 +234,18 @@ public: using TypeHashMode::TypeHashMode; uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &ORE) { - if (MaxTokens == 1) - return 0; - const uint64_t HalfTokens = MaxTokens / 2; - const auto [N, H] = getHash(CB, ORE); - if (!N) { - // Pick the fallback token (ClFallbackToken), which by default is 0, - // meaning it'll fall into the pointer-less bucket. Override by setting - // -alloc-token-fallback if that is the wrong choice. - return H; + if (MDNode *N = getAllocTokenMetadata(CB)) { + MDString *S = cast<MDString>(N->getOperand(0)); + AllocTokenMetadata Metadata{S->getString(), containsPointer(N)}; + if (auto Token = getAllocToken(TokenMode::TypeHashPointerSplit, Metadata, + MaxTokens)) + return *Token; } - uint64_t Hash = H % HalfTokens; // base hash - if (containsPointer(N)) - Hash += HalfTokens; - return Hash; + // Pick the fallback token (ClFallbackToken), which by default is 0, meaning + // it'll fall into the pointer-less bucket. Override by setting + // -alloc-token-fallback if that is the wrong choice. + remarkNoMetadata(CB, ORE); + return ClFallbackToken; } }; @@ -315,6 +305,9 @@ private: FunctionCallee getTokenAllocFunction(const CallBase &CB, uint64_t TokenID, LibFunc OriginalFunc); + /// Lower alloc_token_* intrinsics. + void replaceIntrinsicInst(IntrinsicInst *II, OptimizationRemarkEmitter &ORE); + /// Return the token ID from metadata in the call. uint64_t getToken(const CallBase &CB, OptimizationRemarkEmitter &ORE) { return std::visit([&](auto &&Mode) { return Mode(CB, ORE); }, Mode); @@ -336,21 +329,32 @@ bool AllocToken::instrumentFunction(Function &F) { // Do not apply any instrumentation for naked functions. if (F.hasFnAttribute(Attribute::Naked)) return false; - if (F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation)) - return false; // Don't touch available_externally functions, their actual body is elsewhere. if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false; - // Only instrument functions that have the sanitize_alloc_token attribute. 
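The operator() bodies above now defer to the shared getAllocToken helper, but the pointer-split arithmetic they previously implemented inline (visible in the removed lines) is easy to state on its own. A standalone sketch, names illustrative:

#include <cstdint>

// Bottom half of the token space for pointer-free types, top half for types
// containing pointers; mirrors the inline logic removed above.
uint64_t pointerSplitToken(uint64_t Hash, bool ContainsPointer,
                           uint64_t MaxTokens) {
  if (MaxTokens == 1)
    return 0;
  const uint64_t Half = MaxTokens / 2;
  uint64_t Token = Hash % Half;  // base hash within one half
  if (ContainsPointer)
    Token += Half;               // shift into the pointer bucket
  return Token;
}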
- if (!F.hasFnAttribute(Attribute::SanitizeAllocToken)) - return false; auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F); SmallVector<std::pair<CallBase *, LibFunc>, 4> AllocCalls; + SmallVector<IntrinsicInst *, 4> IntrinsicInsts; + + // Only instrument functions that have the sanitize_alloc_token attribute. + const bool InstrumentFunction = + F.hasFnAttribute(Attribute::SanitizeAllocToken) && + !F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation); // Collect all allocation calls to avoid iterator invalidation. for (Instruction &I : instructions(F)) { + // Collect all alloc_token_* intrinsics. + if (auto *II = dyn_cast<IntrinsicInst>(&I); + II && II->getIntrinsicID() == Intrinsic::alloc_token_id) { + IntrinsicInsts.emplace_back(II); + continue; + } + + if (!InstrumentFunction) + continue; + auto *CB = dyn_cast<CallBase>(&I); if (!CB) continue; @@ -359,11 +363,21 @@ bool AllocToken::instrumentFunction(Function &F) { } bool Modified = false; - for (auto &[CB, Func] : AllocCalls) - Modified |= replaceAllocationCall(CB, Func, ORE, TLI); - if (Modified) - NumFunctionsInstrumented++; + if (!AllocCalls.empty()) { + for (auto &[CB, Func] : AllocCalls) + Modified |= replaceAllocationCall(CB, Func, ORE, TLI); + if (Modified) + NumFunctionsModified++; + } + + if (!IntrinsicInsts.empty()) { + for (auto *II : IntrinsicInsts) + replaceIntrinsicInst(II, ORE); + Modified = true; + NumFunctionsModified++; + } + return Modified; } @@ -381,7 +395,7 @@ AllocToken::shouldInstrumentCall(const CallBase &CB, if (TLI.getLibFunc(*Callee, Func)) { if (isInstrumentableLibFunc(Func, CB, TLI)) return Func; - } else if (Options.Extended && getAllocTokenMetadata(CB)) { + } else if (Options.Extended && CB.getMetadata(LLVMContext::MD_alloc_token)) { return NotLibFunc; } @@ -528,6 +542,16 @@ FunctionCallee AllocToken::getTokenAllocFunction(const CallBase &CB, return TokenAlloc; } +void AllocToken::replaceIntrinsicInst(IntrinsicInst *II, + OptimizationRemarkEmitter &ORE) { + assert(II->getIntrinsicID() == Intrinsic::alloc_token_id); + + uint64_t TokenID = getToken(*II, ORE); + Value *V = ConstantInt::get(IntPtrTy, TokenID); + II->replaceAllUsesWith(V); + II->eraseFromParent(); +} + } // namespace AllocTokenPass::AllocTokenPass(AllocTokenOptions Opts) diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp index d18c0d0..80e77e09 100644 --- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp @@ -2020,7 +2020,6 @@ static void moveFastMathFlags(Function &F, F.removeFnAttr(attr); \ FMF.set##setter(); \ } - MOVE_FLAG("unsafe-fp-math", Fast) MOVE_FLAG("no-infs-fp-math", NoInfs) MOVE_FLAG("no-nans-fp-math", NoNaNs) MOVE_FLAG("no-signed-zeros-fp-math", NoSignedZeros) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index febdc54..adf27be 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1011,6 +1011,10 @@ public: /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { + // Truncs must truncate at most to their destination type. 
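A concrete, hypothetical case for the comment above: if MinBWs recorded 16 bits for %t = trunc i32 %x to i8, the destination type is only 8 bits wide, so "truncating" %t to 16 bits would actually widen it. The guard that follows rejects exactly this shape:

// Sketch: a trunc whose destination scalar width is already below the
// computed minimal bitwidth cannot participate in narrowing.
bool destNarrowerThanMinBW(unsigned DestScalarBits, unsigned MinBW) {
  return DestScalarBits < MinBW; // true => reject (8 < 16 in the example)
}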
+ if (isa_and_nonnull<TruncInst>(I) && MinBWs.contains(I) && + I->getType()->getScalarSizeInBits() < MinBWs.lookup(I)) + return false; return VF.isVector() && MinBWs.contains(I) && !isProfitableToScalarize(I, VF) && !isScalarAfterVectorization(I, VF); @@ -7227,9 +7231,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( return DenseMap<const SCEV *, Value *>(); } - VPlanTransforms::narrowInterleaveGroups( - BestVPlan, BestVF, - TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)); VPlanTransforms::removeDeadRecipes(BestVPlan); VPlanTransforms::convertToConcreteRecipes(BestVPlan); @@ -8198,6 +8199,10 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, if (CM.foldTailWithEVL()) VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength, *Plan, CM.getMaxSafeElements()); + + if (auto P = VPlanTransforms::narrowInterleaveGroups(*Plan, TTI)) + VPlans.push_back(std::move(P)); + assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); VPlans.push_back(std::move(Plan)); } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 3f18bd7..cdb9e7e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5577,62 +5577,79 @@ private: } // Decrement the unscheduled counter and insert to ready list if // ready. - auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE, - unsigned OpIdx) { - if (!ScheduleCopyableDataMap.empty()) { - const EdgeInfo EI = {UserTE, OpIdx}; - if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) { - DecrUnsched(CD, /*IsControl=*/false); - return; - } - } - auto It = OperandsUses.find(I); - assert(It != OperandsUses.end() && "Operand not found"); - if (It->second > 0) { - --It->getSecond(); - assert(TotalOpCount > 0 && "No more operands to decrement"); - --TotalOpCount; - if (ScheduleData *OpSD = getScheduleData(I)) - DecrUnsched(OpSD, /*IsControl=*/false); - } - }; + auto DecrUnschedForInst = + [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx, + SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> + &Checked) { + if (!ScheduleCopyableDataMap.empty()) { + const EdgeInfo EI = {UserTE, OpIdx}; + if (ScheduleCopyableData *CD = + getScheduleCopyableData(EI, I)) { + if (!Checked.insert(std::make_pair(CD, OpIdx)).second) + return; + DecrUnsched(CD, /*IsControl=*/false); + return; + } + } + auto It = OperandsUses.find(I); + assert(It != OperandsUses.end() && "Operand not found"); + if (It->second > 0) { + --It->getSecond(); + assert(TotalOpCount > 0 && "No more operands to decrement"); + --TotalOpCount; + if (ScheduleData *OpSD = getScheduleData(I)) { + if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second) + return; + DecrUnsched(OpSD, /*IsControl=*/false); + } + } + }; for (ScheduleBundle *Bundle : Bundles) { if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0) break; // Need to search for the lane since the tree entry can be // reordered. 
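The Checked set threaded through DecrUnschedForInst above exists to deduplicate decrements: with copyable elements, the same instruction may now be reached through several lanes of one bundle, yet each (schedule entity, operand index) pair must be counted once. The guard in isolation, with illustrative types:

#include <set>
#include <utility>

bool decrementOnce(std::set<std::pair<const void *, unsigned>> &Checked,
                   const void *Entity, unsigned OpIdx, int &UnschedDeps) {
  if (!Checked.insert({Entity, OpIdx}).second)
    return false; // already visited for this operand index: skip
  --UnschedDeps;  // counted exactly once per (entity, operand) pair
  return true;
}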
- int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(), - find(Bundle->getTreeEntry()->Scalars, In)); - assert(Lane >= 0 && "Lane not set"); - if (isa<StoreInst>(In) && - !Bundle->getTreeEntry()->ReorderIndices.empty()) - Lane = Bundle->getTreeEntry()->ReorderIndices[Lane]; - assert(Lane < static_cast<int>( - Bundle->getTreeEntry()->Scalars.size()) && - "Couldn't find extract lane"); - - // Since vectorization tree is being built recursively this - // assertion ensures that the tree entry has all operands set before - // reaching this code. Couple of exceptions known at the moment are - // extracts where their second (immediate) operand is not added. - // Since immediates do not affect scheduler behavior this is - // considered okay. - assert(In && - (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) || - In->getNumOperands() == - Bundle->getTreeEntry()->getNumOperands() || - Bundle->getTreeEntry()->isCopyableElement(In)) && - "Missed TreeEntry operands?"); - - for (unsigned OpIdx : - seq<unsigned>(Bundle->getTreeEntry()->getNumOperands())) - if (auto *I = dyn_cast<Instruction>( - Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) { - LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I - << "\n"); - DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx); - } + auto *It = find(Bundle->getTreeEntry()->Scalars, In); + SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked; + do { + int Lane = + std::distance(Bundle->getTreeEntry()->Scalars.begin(), It); + assert(Lane >= 0 && "Lane not set"); + if (isa<StoreInst>(In) && + !Bundle->getTreeEntry()->ReorderIndices.empty()) + Lane = Bundle->getTreeEntry()->ReorderIndices[Lane]; + assert(Lane < static_cast<int>( + Bundle->getTreeEntry()->Scalars.size()) && + "Couldn't find extract lane"); + + // Since vectorization tree is being built recursively this + // assertion ensures that the tree entry has all operands set + // before reaching this code. Couple of exceptions known at the + // moment are extracts where their second (immediate) operand is + // not added. Since immediates do not affect scheduler behavior + // this is considered okay. + assert(In && + (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) || + In->getNumOperands() == + Bundle->getTreeEntry()->getNumOperands() || + Bundle->getTreeEntry()->isCopyableElement(In)) && + "Missed TreeEntry operands?"); + + for (unsigned OpIdx : + seq<unsigned>(Bundle->getTreeEntry()->getNumOperands())) + if (auto *I = dyn_cast<Instruction>( + Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) { + LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " + << *I << "\n"); + DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked); + } + // If the parent node is schedulable, it will be handled correctly. 
+ if (!Bundle->getTreeEntry()->doesNotNeedToSchedule()) + break; + It = std::find(std::next(It), + Bundle->getTreeEntry()->Scalars.end(), In); + } while (It != Bundle->getTreeEntry()->Scalars.end()); } } else { // If BundleMember is a stand-alone instruction, no operand reordering diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index d167009..c95c887 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -217,32 +217,6 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { return Parent->getEnclosingBlockWithPredecessors(); } -bool VPBlockUtils::isHeader(const VPBlockBase *VPB, - const VPDominatorTree &VPDT) { - auto *VPBB = dyn_cast<VPBasicBlock>(VPB); - if (!VPBB) - return false; - - // If VPBB is in a region R, VPBB is a loop header if R is a loop region with - // VPBB as its entry, i.e., free of predecessors. - if (auto *R = VPBB->getParent()) - return !R->isReplicator() && !VPBB->hasPredecessors(); - - // A header dominates its second predecessor (the latch), with the other - // predecessor being the preheader - return VPB->getPredecessors().size() == 2 && - VPDT.dominates(VPB, VPB->getPredecessors()[1]); -} - -bool VPBlockUtils::isLatch(const VPBlockBase *VPB, - const VPDominatorTree &VPDT) { - // A latch has a header as its second successor, with its other successor - // leaving the loop. A preheader OTOH has a header as its first (and only) - // successor. - return VPB->getNumSuccessors() == 2 && - VPBlockUtils::isHeader(VPB->getSuccessors()[1], VPDT); -} - VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { iterator It = begin(); while (It != end() && It->isPhi()) @@ -768,8 +742,12 @@ static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry) { VPRegionBlock *VPRegionBlock::clone() { const auto &[NewEntry, NewExiting] = cloneFrom(getEntry()); - auto *NewRegion = getPlan()->createVPRegionBlock(NewEntry, NewExiting, - getName(), isReplicator()); + VPlan &Plan = *getPlan(); + VPRegionBlock *NewRegion = + isReplicator() + ? Plan.createReplicateRegion(NewEntry, NewExiting, getName()) + : Plan.createLoopRegion(getName(), NewEntry, NewExiting); + for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry)) Block->setParent(NewRegion); return NewRegion; @@ -1213,6 +1191,7 @@ VPlan *VPlan::duplicate() { } Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount; Old2NewVPValues[&VF] = &NewPlan->VF; + Old2NewVPValues[&UF] = &NewPlan->UF; Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF; if (BackedgeTakenCount) { NewPlan->BackedgeTakenCount = new VPValue(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index fed04eb..167ba55 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -4152,6 +4152,9 @@ class VPlan { /// Represents the vectorization factor of the loop. VPValue VF; + /// Represents the symbolic unroll factor of the loop. + VPValue UF; + /// Represents the loop-invariant VF * UF of the vector loop region. VPValue VFxUF; @@ -4305,6 +4308,9 @@ public: VPValue &getVF() { return VF; }; const VPValue &getVF() const { return VF; }; + /// Returns the symbolic UF of the vector loop region. + VPValue &getSymbolicUF() { return UF; }; + /// Returns VF * UF of the vector loop region. VPValue &getVFxUF() { return VFxUF; } @@ -4314,6 +4320,12 @@ public: void addVF(ElementCount VF) { VFs.insert(VF); } + /// Remove \p VF from the plan. 
+ void removeVF(ElementCount VF) { + assert(hasVF(VF) && "tried to remove VF not present in plan"); + VFs.remove(VF); + } + void setVF(ElementCount VF) { assert(hasVF(VF) && "Cannot set VF not already in plan"); VFs.clear(); @@ -4438,22 +4450,24 @@ public: return VPB; } - /// Create a new VPRegionBlock with \p Entry, \p Exiting and \p Name. If \p - /// IsReplicator is true, the region is a replicate region. The returned block - /// is owned by the VPlan and deleted once the VPlan is destroyed. - VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, - const std::string &Name = "", - bool IsReplicator = false) { - auto *VPB = new VPRegionBlock(Entry, Exiting, Name, IsReplicator); + /// Create a new loop region with \p Name and entry and exiting blocks set + /// to \p Entry and \p Exiting respectively, if set. The returned block is + /// owned by the VPlan and deleted once the VPlan is destroyed. + VPRegionBlock *createLoopRegion(const std::string &Name = "", + VPBlockBase *Entry = nullptr, + VPBlockBase *Exiting = nullptr) { + auto *VPB = Entry ? new VPRegionBlock(Entry, Exiting, Name) + : new VPRegionBlock(Name); CreatedBlocks.push_back(VPB); return VPB; } - /// Create a new loop VPRegionBlock with \p Name and entry and exiting blocks set - /// to nullptr. The returned block is owned by the VPlan and deleted once the - /// VPlan is destroyed. - VPRegionBlock *createVPRegionBlock(const std::string &Name = "") { - auto *VPB = new VPRegionBlock(Name); + /// Create a new replicate region with \p Entry, \p Exiting and \p Name. The + /// returned block is owned by the VPlan and deleted once the VPlan is + /// destroyed. + VPRegionBlock *createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, + const std::string &Name = "") { + auto *VPB = new VPRegionBlock(Entry, Exiting, Name, true); CreatedBlocks.push_back(VPB); return VPB; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 332791a..65688a3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -406,7 +406,7 @@ static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) { // LatchExitVPB, taking care to preserve the original predecessor & successor // order of blocks. Set region entry and exiting after both HeaderVPB and // LatchVPBB have been disconnected from their predecessors/successors. - auto *R = Plan.createVPRegionBlock(); + auto *R = Plan.createLoopRegion(); VPBlockUtils::insertOnEdge(LatchVPBB, LatchExitVPB, R); VPBlockUtils::disconnectBlocks(LatchVPBB, R); VPBlockUtils::connectBlocks(PreheaderVPBB, R); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e060e70..ff25ef5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -372,7 +372,7 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, auto *Exiting = Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); VPRegionBlock *Region = - Plan.createVPRegionBlock(Entry, Exiting, RegionName, true); + Plan.createReplicateRegion(Entry, Exiting, RegionName); // Note: first set Entry as region entry and then connect successors starting // from it in order, to propagate the "parent" of each VPBasicBlock. 
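After this split the region kind is encoded in the factory name rather than in a trailing bool. Hypothetical usage, with Plan, Entry, and Exiting assumed in scope:

// Loop regions may start empty and be populated during construction (as in
// createLoopRegion in VPlanConstruction.cpp above); replicate regions always
// carry their entry and exiting blocks up front.
VPRegionBlock *LoopR = Plan.createLoopRegion("vector loop");
VPRegionBlock *RepR =
    Plan.createReplicateRegion(Entry, Exiting, "pred.store");

Call sites also lose the easy-to-misread trailing literal, as in the createReplicateRegion change in VPlanTransforms.cpp above.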
@@ -1478,11 +1478,8 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
   if (!Plan.getVectorLoopRegion())
     return false;
 
-  if (!Plan.getTripCount()->isLiveIn())
-    return false;
-  auto *TC = dyn_cast_if_present<ConstantInt>(
-      Plan.getTripCount()->getUnderlyingValue());
-  if (!TC || !BestVF.isFixed())
+  const APInt *TC;
+  if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
     return false;
 
   // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
@@ -1495,7 +1492,7 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
     return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
   };
   unsigned NewBitWidth =
-      ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
+      ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
 
   LLVMContext &Ctx = Plan.getContext();
   auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
@@ -2092,8 +2089,8 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
     // Recipes in replicate regions implicitly depend on predicate. If either
     // recipe is in a replicate region, only consider them equal if both have
     // the same parent.
-    const VPRegionBlock *RegionL = L->getParent()->getParent();
-    const VPRegionBlock *RegionR = R->getParent()->getParent();
+    const VPRegionBlock *RegionL = L->getRegion();
+    const VPRegionBlock *RegionR = R->getRegion();
     if (((RegionL && RegionL->isReplicator()) ||
          (RegionR && RegionR->isReplicator())) &&
         L->getParent() != R->getParent())
@@ -3867,8 +3864,7 @@ void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
       // required lanes implicitly.
       // TODO: Remove once replicate regions are unrolled completely.
       auto IsCandidateUnpackUser = [Def](VPUser *U) {
-        VPRegionBlock *ParentRegion =
-            cast<VPRecipeBase>(U)->getParent()->getParent();
+        VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
        return U->usesScalars(Def) &&
               (!ParentRegion || !ParentRegion->isReplicator());
      };
@@ -3960,6 +3956,9 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
   // used.
   // TODO: Assert that they aren't used.
 
+  VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
+  Plan.getSymbolicUF().replaceAllUsesWith(UF);
+
   // If there are no users of the runtime VF, compute VFxUF by constant folding
   // the multiplication of VF and UF.
   if (VF.getNumUsers() == 0) {
@@ -3979,7 +3978,6 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
   }
   VF.replaceAllUsesWith(RuntimeVF);
 
-  VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
   VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
   VFxUF.replaceAllUsesWith(MulByUF);
 }
@@ -4047,14 +4045,14 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
   return false;
 }
 
-/// Returns true if \p IR is a full interleave group with factor and number of
-/// members both equal to \p VF. The interleave group must also access the full
-/// vector width \p VectorRegWidth.
-static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
-                                         unsigned VF, VPTypeAnalysis &TypeInfo,
-                                         unsigned VectorRegWidth) {
+/// Returns VF from \p VFs if \p InterleaveR is a full interleave group with
+/// factor and number of members both equal to VF. The interleave group must
+/// also access the full vector width.
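+/// For example (illustrative numbers, not from the original patch): with i32
+/// group elements on a target with 128-bit vector registers, an interleave
+/// group with factor 4 and 4 members matches VF = 4, since 4 x 32 bits fill
+/// the full 128-bit register.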
+static std::optional<ElementCount> isConsecutiveInterleaveGroup(
+    VPInterleaveRecipe *InterleaveR, ArrayRef<ElementCount> VFs,
+    VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
   if (!InterleaveR)
-    return false;
+    return std::nullopt;
 
   Type *GroupElementTy = nullptr;
   if (InterleaveR->getStoredValues().empty()) {
@@ -4063,7 +4061,7 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
                 [&TypeInfo, GroupElementTy](VPValue *Op) {
                   return TypeInfo.inferScalarType(Op) == GroupElementTy;
                 }))
-      return false;
+      return std::nullopt;
   } else {
     GroupElementTy =
         TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
@@ -4071,13 +4069,27 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
                 [&TypeInfo, GroupElementTy](VPValue *Op) {
                   return TypeInfo.inferScalarType(Op) == GroupElementTy;
                 }))
-      return false;
+      return std::nullopt;
   }
 
-  unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
-  auto IG = InterleaveR->getInterleaveGroup();
-  return IG->getFactor() == VF && IG->getNumMembers() == VF &&
-         GroupSize == VectorRegWidth;
+  auto GetVectorWidthForVF = [&TTI](ElementCount VF) {
+    TypeSize Size = TTI.getRegisterBitWidth(
+        VF.isFixed() ? TargetTransformInfo::RGK_FixedWidthVector
+                     : TargetTransformInfo::RGK_ScalableVector);
+    assert(Size.isScalable() == VF.isScalable() &&
+           "if Size is scalable, VF must be too, and vice versa");
+    return Size.getKnownMinValue();
+  };
+
+  for (ElementCount VF : VFs) {
+    unsigned MinVal = VF.getKnownMinValue();
+    unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
+    auto IG = InterleaveR->getInterleaveGroup();
+    if (IG->getFactor() == MinVal && IG->getNumMembers() == MinVal &&
+        GroupSize == GetVectorWidthForVF(VF))
+      return {VF};
+  }
+  return std::nullopt;
 }
 
 /// Returns true if \p VPValue is a narrow VPValue.
@@ -4088,16 +4100,18 @@ static bool isAlreadyNarrow(VPValue *VPV) {
   return RepR && RepR->isSingleScalar();
 }
 
-void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
-                                             unsigned VectorRegWidth) {
+std::unique_ptr<VPlan>
+VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
+                                        const TargetTransformInfo &TTI) {
+  using namespace llvm::VPlanPatternMatch;
   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
+
   if (!VectorLoop)
-    return;
+    return nullptr;
 
   VPTypeAnalysis TypeInfo(Plan);
-
-  unsigned VFMinVal = VF.getKnownMinValue();
   SmallVector<VPInterleaveRecipe *> StoreGroups;
+  std::optional<ElementCount> VFToOptimize;
   for (auto &R : *VectorLoop->getEntryBasicBlock()) {
     if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount()))
       continue;
@@ -4111,30 +4125,33 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     //   * recipes writing to memory except interleave groups
     // Only support plans with a canonical induction phi.
     if (R.isPhi())
-      return;
+      return nullptr;
 
     auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
     if (R.mayWriteToMemory() && !InterleaveR)
-      return;
-
-    // Do not narrow interleave groups if there are VectorPointer recipes and
-    // the plan was unrolled. The recipe implicitly uses VF from
-    // VPTransformState.
-    // TODO: Remove restriction once the VF for the VectorPointer offset is
-    // modeled explicitly as operand.
-    if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
-      return;
+      return nullptr;
 
     // All other ops are allowed, but we reject uses that cannot be converted
     // when checking all allowed consumers (store interleave groups) below.
    if (!InterleaveR)
      continue;
 
-    // Bail out on non-consecutive interleave groups.
-    if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
-                                      VectorRegWidth))
-      return;
-
+    // Try to find a single VF where all interleave groups are consecutive and
+    // saturate the full vector width. If we already have a candidate VF, check
+    // whether it is applicable to the current InterleaveR; otherwise look for
+    // a suitable VF across the Plan's VFs.
+    if (VFToOptimize) {
+      if (!isConsecutiveInterleaveGroup(InterleaveR, {*VFToOptimize}, TypeInfo,
+                                        TTI))
+        return nullptr;
+    } else {
+      if (auto VF = isConsecutiveInterleaveGroup(
+              InterleaveR, to_vector(Plan.vectorFactors()), TypeInfo, TTI))
+        VFToOptimize = *VF;
+      else
+        return nullptr;
+    }
     // Skip read interleave groups.
     if (InterleaveR->getStoredValues().empty())
      continue;
@@ -4168,24 +4185,34 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
         InterleaveR->getStoredValues()[0]->getDefiningRecipe());
     if (!WideMember0)
-      return;
+      return nullptr;
     for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
       auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
       if (!R || R->getOpcode() != WideMember0->getOpcode() ||
           R->getNumOperands() > 2)
-        return;
+        return nullptr;
       if (any_of(enumerate(R->operands()),
                  [WideMember0, Idx = I](const auto &P) {
                    const auto &[OpIdx, OpV] = P;
                    return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
                  }))
-        return;
+        return nullptr;
     }
     StoreGroups.push_back(InterleaveR);
   }
 
   if (StoreGroups.empty())
-    return;
+    return nullptr;
+
+  // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
+  // original Plan in two: a) a new clone which contains all VFs of Plan except
+  // VFToOptimize, and b) the original Plan with VFToOptimize as its single VF.
+  std::unique_ptr<VPlan> NewPlan;
+  if (size(Plan.vectorFactors()) != 1) {
+    NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
+    Plan.setVF(*VFToOptimize);
+    NewPlan->removeVF(*VFToOptimize);
+  }
 
   // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
  SmallPtrSet<VPValue *, 4> NarrowedOps;
@@ -4256,9 +4283,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
   auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
   VPBuilder PHBuilder(Plan.getVectorPreheader());
 
-  VPValue *UF = Plan.getOrAddLiveIn(
-      ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
-  if (VF.isScalable()) {
+  VPValue *UF = &Plan.getSymbolicUF();
+  if (VFToOptimize->isScalable()) {
     VPValue *VScale = PHBuilder.createElementCount(
         CanIV->getScalarType(), ElementCount::getScalable(1));
     VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
@@ -4270,6 +4296,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
         Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
   }
   removeDeadRecipes(Plan);
+  assert(none_of(*VectorLoop->getEntryBasicBlock(),
+                 IsaPred<VPVectorPointerRecipe>) &&
+         "All VPVectorPointerRecipes should have been removed");
+  return NewPlan;
 }
 
 /// Add branch weight metadata, if the \p Plan's middle block is terminated by a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index b28559b..ca8d956 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -341,14 +341,20 @@ struct VPlanTransforms {
   static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
                                                      ScalarEvolution &SE);
 
-  /// Try to convert a plan with interleave groups with VF elements to a plan
-  /// with the interleave groups replaced by wide loads and stores processing VF
-  /// elements, if all transformed interleave groups access the full vector
-  /// width (checked via \o VectorRegWidth). This effectively is a very simple
-  /// form of loop-aware SLP, where we use interleave groups to identify
-  /// candidates.
-  static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
-                                     unsigned VectorRegWidth);
+  /// Try to find a single VF among \p Plan's VFs for which all interleave
+  /// groups (with known minimum VF elements) can be replaced by wide loads and
+  /// stores processing VF elements, if all transformed interleave groups
+  /// access the full vector width (checked via the maximum vector register
+  /// width). If the transformation can be applied, the original \p Plan is
+  /// split in two:
+  /// 1. The original Plan with the single VF, containing the optimized recipes
+  ///    using wide loads instead of interleave groups.
+  /// 2. A new clone which contains all VFs of Plan except the optimized VF.
+  ///
+  /// This effectively is a very simple form of loop-aware SLP, where we use
+  /// interleave groups to identify candidates.
+  static std::unique_ptr<VPlan>
+  narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI);
 
   /// Predicate and linearize the control-flow in the only loop region of
  /// \p Plan. If \p FoldTail is true, create a mask guarding the loop
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 10801c0..32e4b88 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -8,6 +8,7 @@
 
 #include "VPlanUtils.h"
 #include "VPlanCFG.h"
+#include "VPlanDominatorTree.h"
 #include "VPlanPatternMatch.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -253,3 +254,29 @@ vputils::getRecipesForUncountableExit(VPlan &Plan,
 
   return UncountableCondition;
 }
+
+bool VPBlockUtils::isHeader(const VPBlockBase *VPB,
+                            const VPDominatorTree &VPDT) {
+  auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
+  if (!VPBB)
+    return false;
+
+  // If VPBB is in a region R, VPBB is a loop header if R is a loop region with
+  // VPBB as its entry, i.e., free of predecessors.
+  if (auto *R = VPBB->getParent())
+    return !R->isReplicator() && !VPBB->hasPredecessors();
+
+  // A header dominates its second predecessor (the latch), with the other
+  // predecessor being the preheader.
+  return VPB->getPredecessors().size() == 2 &&
+         VPDT.dominates(VPB, VPB->getPredecessors()[1]);
+}
+
+bool VPBlockUtils::isLatch(const VPBlockBase *VPB,
+                           const VPDominatorTree &VPDT) {
+  // A latch has a header as its second successor, with its other successor
+  // leaving the loop. A preheader, on the other hand, has a header as its
+  // first (and only) successor.
+  return VPB->getNumSuccessors() == 2 &&
+         VPBlockUtils::isHeader(VPB->getSuccessors()[1], VPDT);
+}
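A minimal sketch of how the relocated predicates can be queried, assuming an existing flattened VPlan `Plan`; this is illustrative usage, not code from the patch:

  // Compute dominance information for the plan's CFG, then classify blocks.
  VPDominatorTree VPDT;
  VPDT.recalculate(Plan);
  for (VPBlockBase *VPB : vp_depth_first_shallow(Plan.getEntry())) {
    if (VPBlockUtils::isHeader(VPB, VPDT))
      dbgs() << VPB->getName() << " is a loop header\n";
    else if (VPBlockUtils::isLatch(VPB, VPDT))
      dbgs() << VPB->getName() << " is a loop latch\n";
  }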