Diffstat (limited to 'llvm/lib')
100 files changed, 1757 insertions, 1314 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index b744537..45c889c 100755 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -329,6 +329,7 @@ bool llvm::IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV, // Look through ptr->int and ptr->ptr casts. if (CE->getOpcode() == Instruction::PtrToInt || + CE->getOpcode() == Instruction::PtrToAddr || CE->getOpcode() == Instruction::BitCast) return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, DL, DSOEquiv); @@ -1495,22 +1496,22 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, default: llvm_unreachable("Missing case"); case Instruction::PtrToAddr: - // TODO: Add some of the ptrtoint folds here as well. - break; case Instruction::PtrToInt: if (auto *CE = dyn_cast<ConstantExpr>(C)) { Constant *FoldedValue = nullptr; - // If the input is a inttoptr, eliminate the pair. This requires knowing + // If the input is an inttoptr, eliminate the pair. This requires knowing // the width of a pointer, so it can't be done in ConstantExpr::getCast. if (CE->getOpcode() == Instruction::IntToPtr) { - // zext/trunc the inttoptr to pointer size. - FoldedValue = ConstantFoldIntegerCast(CE->getOperand(0), - DL.getIntPtrType(CE->getType()), + // zext/trunc the inttoptr to pointer/address size. + Type *MidTy = Opcode == Instruction::PtrToInt + ? DL.getAddressType(CE->getType()) + : DL.getIntPtrType(CE->getType()); + FoldedValue = ConstantFoldIntegerCast(CE->getOperand(0), MidTy, /*IsSigned=*/false, DL); } else if (auto *GEP = dyn_cast<GEPOperator>(CE)) { // If we have GEP, we can perform the following folds: - // (ptrtoint (gep null, x)) -> x - // (ptrtoint (gep (gep null, x), y) -> x + y, etc. + // (ptrtoint/ptrtoaddr (gep null, x)) -> x + // (ptrtoint/ptrtoaddr (gep (gep null, x), y) -> x + y, etc. unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType()); APInt BaseOffset(BitWidth, 0); auto *Base = cast<Constant>(GEP->stripAndAccumulateConstantOffsets( @@ -1518,7 +1519,8 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, if (Base->isNullValue()) { FoldedValue = ConstantInt::get(CE->getContext(), BaseOffset); } else { - // ptrtoint (gep i8, Ptr, (sub 0, V)) -> sub (ptrtoint Ptr), V + // ptrtoint/ptrtoaddr (gep i8, Ptr, (sub 0, V)) + // -> sub (ptrtoint/ptrtoaddr Ptr), V if (GEP->getNumIndices() == 1 && GEP->getSourceElementType()->isIntegerTy(8)) { auto *Ptr = cast<Constant>(GEP->getPointerOperand()); @@ -1528,12 +1530,13 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, Sub->getOpcode() == Instruction::Sub && Sub->getOperand(0)->isNullValue()) FoldedValue = ConstantExpr::getSub( - ConstantExpr::getPtrToInt(Ptr, IntIdxTy), Sub->getOperand(1)); + ConstantExpr::getCast(Opcode, Ptr, IntIdxTy), + Sub->getOperand(1)); } } } if (FoldedValue) { - // Do a zext or trunc to get to the ptrtoint dest size. + // Do a zext or trunc to get to the ptrtoint/ptrtoaddr dest size. return ConstantFoldIntegerCast(FoldedValue, DestTy, /*IsSigned=*/false, DL); } diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 1f0da8d1..8d20b0e 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -275,7 +275,7 @@ bool Dependence::isAnti() const { // if no subscript in the source or destination mention the induction // variable associated with the loop at this level. 
// Leave this out of line, so it will serve as a virtual method anchor -bool Dependence::isScalar(unsigned level, bool isSameSD) const { return false; } +bool Dependence::isScalar(unsigned level, bool IsSameSD) const { return false; } //===----------------------------------------------------------------------===// // FullDependence methods @@ -351,38 +351,38 @@ bool FullDependence::normalize(ScalarEvolution *SE) { // getDirection - Returns the direction associated with a particular common or // SameSD level. -unsigned FullDependence::getDirection(unsigned Level, bool isSameSD) const { - return getDVEntry(Level, isSameSD).Direction; +unsigned FullDependence::getDirection(unsigned Level, bool IsSameSD) const { + return getDVEntry(Level, IsSameSD).Direction; } // Returns the distance (or NULL) associated with a particular common or // SameSD level. -const SCEV *FullDependence::getDistance(unsigned Level, bool isSameSD) const { - return getDVEntry(Level, isSameSD).Distance; +const SCEV *FullDependence::getDistance(unsigned Level, bool IsSameSD) const { + return getDVEntry(Level, IsSameSD).Distance; } // Returns true if a particular regular or SameSD level is scalar; that is, // if no subscript in the source or destination mention the induction variable // associated with the loop at this level. -bool FullDependence::isScalar(unsigned Level, bool isSameSD) const { - return getDVEntry(Level, isSameSD).Scalar; +bool FullDependence::isScalar(unsigned Level, bool IsSameSD) const { + return getDVEntry(Level, IsSameSD).Scalar; } // Returns true if peeling the first iteration from this regular or SameSD // loop level will break this dependence. -bool FullDependence::isPeelFirst(unsigned Level, bool isSameSD) const { - return getDVEntry(Level, isSameSD).PeelFirst; +bool FullDependence::isPeelFirst(unsigned Level, bool IsSameSD) const { + return getDVEntry(Level, IsSameSD).PeelFirst; } // Returns true if peeling the last iteration from this regular or SameSD // loop level will break this dependence. -bool FullDependence::isPeelLast(unsigned Level, bool isSameSD) const { - return getDVEntry(Level, isSameSD).PeelLast; +bool FullDependence::isPeelLast(unsigned Level, bool IsSameSD) const { + return getDVEntry(Level, IsSameSD).PeelLast; } // Returns true if splitting loop will break the dependence. -bool FullDependence::isSplitable(unsigned Level, bool isSameSD) const { - return getDVEntry(Level, isSameSD).Splitable; +bool FullDependence::isSplitable(unsigned Level, bool IsSameSD) const { + return getDVEntry(Level, IsSameSD).Splitable; } // inSameSDLoops - Returns true if this level is an SameSD level, i.e., @@ -691,7 +691,7 @@ void Dependence::dump(raw_ostream &OS) const { dumpImp(OS); unsigned SameSDLevels = getSameSDLevels(); if (SameSDLevels > 0) { - OS << "! / assuming " << SameSDLevels << " loop level(s) fused: "; + OS << " / assuming " << SameSDLevels << " loop level(s) fused: "; dumpImp(OS, true); } } @@ -706,13 +706,13 @@ void Dependence::dump(raw_ostream &OS) const { // For debugging purposes. Dumps a dependence to OS with or without considering // the SameSD levels. 
-void Dependence::dumpImp(raw_ostream &OS, bool isSameSD) const { +void Dependence::dumpImp(raw_ostream &OS, bool IsSameSD) const { bool Splitable = false; unsigned Levels = getLevels(); unsigned SameSDLevels = getSameSDLevels(); bool OnSameSD = false; unsigned LevelNum = Levels; - if (isSameSD) + if (IsSameSD) LevelNum += SameSDLevels; OS << " ["; for (unsigned II = 1; II <= LevelNum; ++II) { diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index 1794a60..85b5372 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -153,11 +153,6 @@ void Embedding::print(raw_ostream &OS) const { // Embedder and its subclasses //===----------------------------------------------------------------------===// -Embedder::Embedder(const Function &F, const Vocabulary &Vocab) - : F(F), Vocab(Vocab), Dimension(Vocab.getDimension()), - OpcWeight(::OpcWeight), TypeWeight(::TypeWeight), ArgWeight(::ArgWeight), - FuncVector(Embedding(Dimension)) {} - std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F, const Vocabulary &Vocab) { switch (Mode) { @@ -169,110 +164,85 @@ std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F, return nullptr; } -const InstEmbeddingsMap &Embedder::getInstVecMap() const { - if (InstVecMap.empty()) - computeEmbeddings(); - return InstVecMap; -} - -const BBEmbeddingsMap &Embedder::getBBVecMap() const { - if (BBVecMap.empty()) - computeEmbeddings(); - return BBVecMap; -} - -const Embedding &Embedder::getBBVector(const BasicBlock &BB) const { - auto It = BBVecMap.find(&BB); - if (It != BBVecMap.end()) - return It->second; - computeEmbeddings(BB); - return BBVecMap[&BB]; -} +Embedding Embedder::computeEmbeddings() const { + Embedding FuncVector(Dimension, 0.0); -const Embedding &Embedder::getFunctionVector() const { - // Currently, we always (re)compute the embeddings for the function. - // This is cheaper than caching the vector. - computeEmbeddings(); - return FuncVector; -} - -void Embedder::computeEmbeddings() const { if (F.isDeclaration()) - return; - - FuncVector = Embedding(Dimension, 0.0); + return FuncVector; // Consider only the basic blocks that are reachable from entry - for (const BasicBlock *BB : depth_first(&F)) { - computeEmbeddings(*BB); - FuncVector += BBVecMap[BB]; - } + for (const BasicBlock *BB : depth_first(&F)) + FuncVector += computeEmbeddings(*BB); + return FuncVector; } -void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const { +Embedding Embedder::computeEmbeddings(const BasicBlock &BB) const { Embedding BBVector(Dimension, 0); // We consider only the non-debug and non-pseudo instructions - for (const auto &I : BB.instructionsWithoutDebug()) { - Embedding ArgEmb(Dimension, 0); - for (const auto &Op : I.operands()) - ArgEmb += Vocab[*Op]; - auto InstVector = - Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; - if (const auto *IC = dyn_cast<CmpInst>(&I)) - InstVector += Vocab[IC->getPredicate()]; - InstVecMap[&I] = InstVector; - BBVector += InstVector; - } - BBVecMap[&BB] = BBVector; -} - -void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const { - Embedding BBVector(Dimension, 0); + for (const auto &I : BB.instructionsWithoutDebug()) + BBVector += computeEmbeddings(I); + return BBVector; +} + +Embedding SymbolicEmbedder::computeEmbeddings(const Instruction &I) const { + // Currently, we always (re)compute the embeddings for symbolic embedder. + // This is cheaper than caching the vectors. 
+ Embedding ArgEmb(Dimension, 0); + for (const auto &Op : I.operands()) + ArgEmb += Vocab[*Op]; + auto InstVector = + Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; + if (const auto *IC = dyn_cast<CmpInst>(&I)) + InstVector += Vocab[IC->getPredicate()]; + return InstVector; +} + +Embedding FlowAwareEmbedder::computeEmbeddings(const Instruction &I) const { + // If we have already computed the embedding for this instruction, return it + auto It = InstVecMap.find(&I); + if (It != InstVecMap.end()) + return It->second; - // We consider only the non-debug and non-pseudo instructions - for (const auto &I : BB.instructionsWithoutDebug()) { - // TODO: Handle call instructions differently. - // For now, we treat them like other instructions - Embedding ArgEmb(Dimension, 0); - for (const auto &Op : I.operands()) { - // If the operand is defined elsewhere, we use its embedding - if (const auto *DefInst = dyn_cast<Instruction>(Op)) { - auto DefIt = InstVecMap.find(DefInst); - // Fixme (#159171): Ideally we should never miss an instruction - // embedding here. - // But when we have cyclic dependencies (e.g., phi - // nodes), we might miss the embedding. In such cases, we fall back to - // using the vocabulary embedding. This can be fixed by iterating to a - // fixed-point, or by using a simple solver for the set of simultaneous - // equations. - // Another case when we might miss an instruction embedding is when - // the operand instruction is in a different basic block that has not - // been processed yet. This can be fixed by processing the basic blocks - // in a topological order. - if (DefIt != InstVecMap.end()) - ArgEmb += DefIt->second; - else - ArgEmb += Vocab[*Op]; - } - // If the operand is not defined by an instruction, we use the vocabulary - else { - LLVM_DEBUG(errs() << "Using embedding from vocabulary for operand: " - << *Op << "=" << Vocab[*Op][0] << "\n"); + // TODO: Handle call instructions differently. + // For now, we treat them like other instructions + Embedding ArgEmb(Dimension, 0); + for (const auto &Op : I.operands()) { + // If the operand is defined elsewhere, we use its embedding + if (const auto *DefInst = dyn_cast<Instruction>(Op)) { + auto DefIt = InstVecMap.find(DefInst); + // Fixme (#159171): Ideally we should never miss an instruction + // embedding here. + // But when we have cyclic dependencies (e.g., phi + // nodes), we might miss the embedding. In such cases, we fall back to + // using the vocabulary embedding. This can be fixed by iterating to a + // fixed-point, or by using a simple solver for the set of simultaneous + // equations. + // Another case when we might miss an instruction embedding is when + // the operand instruction is in a different basic block that has not + // been processed yet. This can be fixed by processing the basic blocks + // in a topological order. 
+ if (DefIt != InstVecMap.end()) + ArgEmb += DefIt->second; + else ArgEmb += Vocab[*Op]; - } } - // Create the instruction vector by combining opcode, type, and arguments - // embeddings - auto InstVector = - Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; - // Add compare predicate embedding as an additional operand if applicable - if (const auto *IC = dyn_cast<CmpInst>(&I)) - InstVector += Vocab[IC->getPredicate()]; - InstVecMap[&I] = InstVector; - BBVector += InstVector; + // If the operand is not defined by an instruction, we use the + // vocabulary + else { + LLVM_DEBUG(errs() << "Using embedding from vocabulary for operand: " + << *Op << "=" << Vocab[*Op][0] << "\n"); + ArgEmb += Vocab[*Op]; + } } - BBVecMap[&BB] = BBVector; + // Create the instruction vector by combining opcode, type, and arguments + // embeddings + auto InstVector = + Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; + if (const auto *IC = dyn_cast<CmpInst>(&I)) + InstVector += Vocab[IC->getPredicate()]; + InstVecMap[&I] = InstVector; + return InstVector; } // ==----------------------------------------------------------------------===// @@ -695,25 +665,17 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M, Emb->getFunctionVector().print(OS); OS << "Basic block vectors:\n"; - const auto &BBMap = Emb->getBBVecMap(); for (const BasicBlock &BB : F) { - auto It = BBMap.find(&BB); - if (It != BBMap.end()) { - OS << "Basic block: " << BB.getName() << ":\n"; - It->second.print(OS); - } + OS << "Basic block: " << BB.getName() << ":\n"; + Emb->getBBVector(BB).print(OS); } OS << "Instruction vectors:\n"; - const auto &InstMap = Emb->getInstVecMap(); for (const BasicBlock &BB : F) { for (const Instruction &I : BB) { - auto It = InstMap.find(&I); - if (It != InstMap.end()) { - OS << "Instruction: "; - I.print(OS); - It->second.print(OS); - } + OS << "Instruction: "; + I.print(OS); + Emb->getInstVector(I).print(OS); } } } diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index d1977f0..4e38626 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -671,12 +671,12 @@ Value *llvm::simplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, /// This is very similar to stripAndAccumulateConstantOffsets(), except it /// normalizes the offset bitwidth to the stripped pointer type, not the /// original pointer type. -static APInt stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V, - bool AllowNonInbounds = false) { +static APInt stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V) { assert(V->getType()->isPtrOrPtrVectorTy()); APInt Offset = APInt::getZero(DL.getIndexTypeSizeInBits(V->getType())); - V = V->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds); + V = V->stripAndAccumulateConstantOffsets(DL, Offset, + /*AllowNonInbounds=*/true); // As that strip may trace through `addrspacecast`, need to sext or trunc // the offset calculated. return Offset.sextOrTrunc(DL.getIndexTypeSizeInBits(V->getType())); @@ -853,10 +853,12 @@ static Value *simplifySubInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, return W; // Variations on GEP(base, I, ...) - GEP(base, i, ...) -> GEP(null, I-i, ...). 
- if (match(Op0, m_PtrToInt(m_Value(X))) && match(Op1, m_PtrToInt(m_Value(Y)))) + if (match(Op0, m_PtrToIntOrAddr(m_Value(X))) && + match(Op1, m_PtrToIntOrAddr(m_Value(Y)))) { if (Constant *Result = computePointerDifference(Q.DL, X, Y)) return ConstantFoldIntegerCast(Result, Op0->getType(), /*IsSigned*/ true, Q.DL); + } // i1 sub -> xor. if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1)) diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 4c2e1fe..54f55b2 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -812,7 +812,9 @@ static bool isPointerUseReplacable(const Use &U) { auto *User = Worklist.pop_back_val(); if (!Visited.insert(User).second) continue; - if (isa<ICmpInst, PtrToIntInst>(User)) + // FIXME: The PtrToIntInst case here is not strictly correct, as it + // changes which provenance is exposed. + if (isa<ICmpInst, PtrToIntInst, PtrToAddrInst>(User)) continue; if (isa<PHINode, SelectInst>(User)) Worklist.append(User->user_begin(), User->user_end()); diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 6f6776c..30bcff7 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -15749,51 +15749,11 @@ void ScalarEvolution::LoopGuards::collectFromBlock( return RewriteMap.lookup_or(S, S); }; - // Check for the SCEV expression (A /u B) * B while B is a constant, inside - // \p Expr. The check is done recuresively on \p Expr, which is assumed to - // be a composition of Min/Max SCEVs. Return whether the SCEV expression (A - // /u B) * B was found, and return the divisor B in \p DividesBy. For - // example, if Expr = umin (umax ((A /u 8) * 8, 16), 64), return true since - // (A /u 8) * 8 matched the pattern, and return the constant SCEV 8 in \p - // DividesBy. - std::function<bool(const SCEV *, const SCEV *&)> HasDivisibiltyInfo = - [&](const SCEV *Expr, const SCEV *&DividesBy) { - if (auto *Mul = dyn_cast<SCEVMulExpr>(Expr)) { - if (Mul->getNumOperands() != 2) - return false; - auto *MulLHS = Mul->getOperand(0); - auto *MulRHS = Mul->getOperand(1); - if (isa<SCEVConstant>(MulLHS)) - std::swap(MulLHS, MulRHS); - if (auto *Div = dyn_cast<SCEVUDivExpr>(MulLHS)) - if (Div->getOperand(1) == MulRHS) { - DividesBy = MulRHS; - return true; - } - } - if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr)) - return HasDivisibiltyInfo(MinMax->getOperand(0), DividesBy) || - HasDivisibiltyInfo(MinMax->getOperand(1), DividesBy); - return false; - }; - - // Return true if Expr known to divide by \p DividesBy. - std::function<bool(const SCEV *, const SCEV *&)> IsKnownToDivideBy = - [&](const SCEV *Expr, const SCEV *DividesBy) { - if (SE.getURemExpr(Expr, DividesBy)->isZero()) - return true; - if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr)) - return IsKnownToDivideBy(MinMax->getOperand(0), DividesBy) && - IsKnownToDivideBy(MinMax->getOperand(1), DividesBy); - return false; - }; - const SCEV *RewrittenLHS = GetMaybeRewritten(LHS); const SCEV *DividesBy = nullptr; - if (HasDivisibiltyInfo(RewrittenLHS, DividesBy)) - // Check that the whole expression is divided by DividesBy - DividesBy = - IsKnownToDivideBy(RewrittenLHS, DividesBy) ? DividesBy : nullptr; + const APInt &Multiple = SE.getConstantMultiple(RewrittenLHS); + if (!Multiple.isOne()) + DividesBy = SE.getConstant(Multiple); // Collect rewrites for LHS and its transitive operands based on the // condition. 
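A rough standalone sketch (not part of the patch) of the pointer-difference simplification touched in InstructionSimplify.cpp above; the function and value names here are illustrative, and it exercises the pre-existing ptrtoint form that the hunk generalizes to ptrtoaddr via m_PtrToIntOrAddr:
#include "llvm/AsmParser/Parser.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
int main() {
  llvm::LLVMContext Ctx;
  llvm::SMDiagnostic Err;
  // Illustrative IR: the difference of the two casts is a known constant.
  std::unique_ptr<llvm::Module> M = llvm::parseAssemblyString(R"IR(
    define i64 @diff(ptr %p) {
      %q = getelementptr inbounds i8, ptr %p, i64 4
      %a = ptrtoint ptr %q to i64
      %b = ptrtoint ptr %p to i64
      %d = sub i64 %a, %b
      ret i64 %d
    }
  )IR", Err, Ctx);
  if (!M)
    return 1;
  llvm::Function &F = *M->getFunction("diff");
  llvm::SimplifyQuery Q(M->getDataLayout());
  for (llvm::Instruction &I : F.front())
    if (I.getName() == "d")
      // The sub of the two casts simplifies to the constant offset (i64 4).
      if (llvm::Value *V = llvm::simplifyInstruction(&I, Q))
        V->print(llvm::outs());
  return 0;
}
With the matcher change above, the same constant difference is expected to be recognized when the casts are ptrtoaddr rather than ptrtoint.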
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 6e92766..813632c 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -740,11 +740,6 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setAvailable(LibFunc_fgets_unlocked); } - if (T.isAndroid() && T.isAndroidVersionLT(21)) { - TLI.setUnavailable(LibFunc_stpcpy); - TLI.setUnavailable(LibFunc_stpncpy); - } - if (T.isPS()) { // PS4/PS5 do have memalign. TLI.setAvailable(LibFunc_memalign); diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index f6937d3..50d1d47 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -982,6 +982,7 @@ lltok::Kind LLLexer::LexIdentifier() { DWKEYWORD(ATE, DwarfAttEncoding); DWKEYWORD(VIRTUALITY, DwarfVirtuality); DWKEYWORD(LANG, DwarfLang); + DWKEYWORD(LNAME, DwarfSourceLangName); DWKEYWORD(CC, DwarfCC); DWKEYWORD(OP, DwarfOp); DWKEYWORD(MACINFO, DwarfMacinfo); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 5589966..380b192 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -4740,6 +4740,10 @@ struct DwarfLangField : public MDUnsignedField { DwarfLangField() : MDUnsignedField(0, dwarf::DW_LANG_hi_user) {} }; +struct DwarfSourceLangNameField : public MDUnsignedField { + DwarfSourceLangNameField() : MDUnsignedField(0, UINT32_MAX) {} +}; + struct DwarfCCField : public MDUnsignedField { DwarfCCField() : MDUnsignedField(0, dwarf::DW_CC_hi_user) {} }; @@ -4998,6 +5002,25 @@ bool LLParser::parseMDField(LocTy Loc, StringRef Name, DwarfLangField &Result) { } template <> +bool LLParser::parseMDField(LocTy Loc, StringRef Name, + DwarfSourceLangNameField &Result) { + if (Lex.getKind() == lltok::APSInt) + return parseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result)); + + if (Lex.getKind() != lltok::DwarfSourceLangName) + return tokError("expected DWARF source language name"); + + unsigned Lang = dwarf::getSourceLanguageName(Lex.getStrVal()); + if (!Lang) + return tokError("invalid DWARF source language name" + Twine(" '") + + Lex.getStrVal() + "'"); + assert(Lang <= Result.Max && "Expected valid DWARF source language name"); + Result.assign(Lang); + Lex.Lex(); + return false; +} + +template <> bool LLParser::parseMDField(LocTy Loc, StringRef Name, DwarfCCField &Result) { if (Lex.getKind() == lltok::APSInt) return parseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result)); @@ -5836,9 +5859,12 @@ bool LLParser::parseDICompileUnit(MDNode *&Result, bool IsDistinct) { if (!IsDistinct) return tokError("missing 'distinct', required for !DICompileUnit"); + LocTy Loc = Lex.getLoc(); + #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ - REQUIRED(language, DwarfLangField, ); \ REQUIRED(file, MDField, (/* AllowNull */ false)); \ + OPTIONAL(language, DwarfLangField, ); \ + OPTIONAL(sourceLanguageName, DwarfSourceLangNameField, ); \ OPTIONAL(producer, MDStringField, ); \ OPTIONAL(isOptimized, MDBoolField, ); \ OPTIONAL(flags, MDStringField, ); \ @@ -5860,12 +5886,23 @@ bool LLParser::parseDICompileUnit(MDNode *&Result, bool IsDistinct) { PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS + if (!language.Seen && !sourceLanguageName.Seen) + return error(Loc, "missing one of 'language' or 'sourceLanguageName', " + "required for !DICompileUnit"); + + if (language.Seen && sourceLanguageName.Seen) + return error(Loc, "can only specify one of 'language' and " + "'sourceLanguageName' on 
!DICompileUnit"); + Result = DICompileUnit::getDistinct( - Context, DISourceLanguageName(language.Val), file.Val, producer.Val, - isOptimized.Val, flags.Val, runtimeVersion.Val, splitDebugFilename.Val, - emissionKind.Val, enums.Val, retainedTypes.Val, globals.Val, imports.Val, - macros.Val, dwoId.Val, splitDebugInlining.Val, debugInfoForProfiling.Val, - nameTableKind.Val, rangesBaseAddress.Val, sysroot.Val, sdk.Val); + Context, + language.Seen ? DISourceLanguageName(language.Val) + : DISourceLanguageName(sourceLanguageName.Val, 0), + file.Val, producer.Val, isOptimized.Val, flags.Val, runtimeVersion.Val, + splitDebugFilename.Val, emissionKind.Val, enums.Val, retainedTypes.Val, + globals.Val, imports.Val, macros.Val, dwoId.Val, splitDebugInlining.Val, + debugInfoForProfiling.Val, nameTableKind.Val, rangesBaseAddress.Val, + sysroot.Val, sdk.Val); return false; } diff --git a/llvm/lib/BinaryFormat/Dwarf.cpp b/llvm/lib/BinaryFormat/Dwarf.cpp index 969047a..55fa2df 100644 --- a/llvm/lib/BinaryFormat/Dwarf.cpp +++ b/llvm/lib/BinaryFormat/Dwarf.cpp @@ -893,6 +893,8 @@ StringRef llvm::dwarf::AttributeValueString(uint16_t Attr, unsigned Val) { return DefaultedMemberString(Val); case DW_AT_APPLE_enum_kind: return EnumKindString(Val); + case DW_AT_language_name: + return SourceLanguageNameString(static_cast<SourceLanguageName>(Val)); } return StringRef(); diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index a4d1b83..cdcf7a8 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -1867,12 +1867,18 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( // distinct. It's always distinct. IsDistinct = true; + const auto LangVersionMask = (uint64_t(1) << 63); + const bool HasVersionedLanguage = Record[1] & LangVersionMask; + auto *CU = DICompileUnit::getDistinct( - Context, DISourceLanguageName(Record[1]), getMDOrNull(Record[2]), - getMDString(Record[3]), Record[4], getMDString(Record[5]), Record[6], - getMDString(Record[7]), Record[8], getMDOrNull(Record[9]), - getMDOrNull(Record[10]), getMDOrNull(Record[12]), - getMDOrNull(Record[13]), + Context, + HasVersionedLanguage + ? DISourceLanguageName(Record[1] & ~LangVersionMask, 0) + : DISourceLanguageName(Record[1]), + getMDOrNull(Record[2]), getMDString(Record[3]), Record[4], + getMDString(Record[5]), Record[6], getMDString(Record[7]), Record[8], + getMDOrNull(Record[9]), getMDOrNull(Record[10]), + getMDOrNull(Record[12]), getMDOrNull(Record[13]), Record.size() <= 15 ? nullptr : getMDOrNull(Record[15]), Record.size() <= 14 ? 0 : Record[14], Record.size() <= 16 ? true : Record[16], diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 7ed140d..0ca55a26 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -2108,7 +2108,13 @@ void ModuleBitcodeWriter::writeDICompileUnit(const DICompileUnit *N, assert(N->isDistinct() && "Expected distinct compile units"); Record.push_back(/* IsDistinct */ true); - Record.push_back(N->getSourceLanguage().getUnversionedName()); + auto Lang = N->getSourceLanguage(); + Record.push_back(Lang.getName()); + // Set bit so the MetadataLoader can distniguish between versioned and + // unversioned names. 
+ if (Lang.hasVersionedName()) + Record.back() ^= (uint64_t(1) << 63); + Record.push_back(VE.getMetadataOrNullID(N->getFile())); Record.push_back(VE.getMetadataOrNullID(N->getRawProducer())); Record.push_back(N->isOptimized()); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index bc0bb34..f0f0861 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -587,10 +587,12 @@ bool DwarfExpression::addExpression( emitUnsigned(LeftShift); emitOp(dwarf::DW_OP_shl); } - emitOp(dwarf::DW_OP_constu); - emitUnsigned(RightShift); - emitOp(OpNum == dwarf::DW_OP_LLVM_extract_bits_sext ? dwarf::DW_OP_shra - : dwarf::DW_OP_shr); + if (RightShift) { + emitOp(dwarf::DW_OP_constu); + emitUnsigned(RightShift); + emitOp(OpNum == dwarf::DW_OP_LLVM_extract_bits_sext ? dwarf::DW_OP_shra + : dwarf::DW_OP_shr); + } // The value is now at the top of the stack, so set the location to // implicit so that we get a stack_value at the end. diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index eb73d01b..4320b1d 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -3194,7 +3194,7 @@ struct ExtAddrMode : public TargetLowering::AddrMode { case ScaledRegField: return ScaledReg; case BaseOffsField: - return ConstantInt::get(IntPtrTy, BaseOffs); + return ConstantInt::getSigned(IntPtrTy, BaseOffs); } } @@ -6100,7 +6100,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // Add in the Base Offset if present. if (AddrMode.BaseOffs) { - Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); + Value *V = ConstantInt::getSigned(IntPtrTy, AddrMode.BaseOffs); if (ResultIndex) { // We need to add this separately from the scale above to help with // SDAG consecutive load/store merging. @@ -6226,7 +6226,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // Add in the Base Offset if present. 
if (AddrMode.BaseOffs) { - Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); + Value *V = ConstantInt::getSigned(IntPtrTy, AddrMode.BaseOffs); if (Result) Result = Builder.CreateAdd(Result, V, "sunkaddr"); else diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index fa0ccd6..b425b95 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1215,7 +1215,7 @@ bool CombinerHelper::isIndexedLoadStoreLegal(GLoadStore &LdSt) const { LLT MemTy = LdSt.getMMO().getMemoryType(); SmallVector<LegalityQuery::MemDesc, 2> MemDescrs( {{MemTy, MemTy.getSizeInBits().getKnownMinValue(), - AtomicOrdering::NotAtomic}}); + AtomicOrdering::NotAtomic, AtomicOrdering::NotAtomic}}); unsigned IndexedOpc = getIndexedOpc(LdSt.getOpcode()); SmallVector<LLT> OpTys; if (IndexedOpc == TargetOpcode::G_INDEXED_STORE) @@ -1728,6 +1728,7 @@ static APFloat constantFoldFpUnary(const MachineInstr &MI, Result.clearSign(); return Result; } + case TargetOpcode::G_FPEXT: case TargetOpcode::G_FPTRUNC: { bool Unused; LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index 3f6813e..90c60d4 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -344,6 +344,22 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, Known = KnownBits::mul(Known, Known2); break; } + case TargetOpcode::G_UMULH: { + computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts, + Depth + 1); + computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, + Depth + 1); + Known = KnownBits::mulhu(Known, Known2); + break; + } + case TargetOpcode::G_SMULH: { + computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts, + Depth + 1); + computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, + Depth + 1); + Known = KnownBits::mulhs(Known, Known2); + break; + } case TargetOpcode::G_SELECT: { computeKnownBitsMin(MI.getOperand(2).getReg(), MI.getOperand(3).getReg(), Known, DemandedElts, Depth + 1); diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp index b2f8435..cdc1f64 100644 --- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp @@ -958,7 +958,8 @@ void LoadStoreOpt::initializeStoreMergeTargetInfo(unsigned AddrSpace) { for (unsigned Size = 2; Size <= MaxStoreSizeToForm; Size *= 2) { LLT Ty = LLT::scalar(Size); SmallVector<LegalityQuery::MemDesc, 2> MemDescrs( - {{Ty, Ty.getSizeInBits(), AtomicOrdering::NotAtomic}}); + {{Ty, Ty.getSizeInBits(), AtomicOrdering::NotAtomic, + AtomicOrdering::NotAtomic}}); SmallVector<LLT> StoreTys({Ty, PtrTy}); LegalityQuery Q(TargetOpcode::G_STORE, StoreTys, MemDescrs); LegalizeActionStep ActionStep = LI.getAction(Q); diff --git a/llvm/lib/CodeGen/MIR2Vec.cpp b/llvm/lib/CodeGen/MIR2Vec.cpp index 87565c0..e859765 100644 --- a/llvm/lib/CodeGen/MIR2Vec.cpp +++ b/llvm/lib/CodeGen/MIR2Vec.cpp @@ -49,14 +49,8 @@ cl::opt<float> OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0), //===----------------------------------------------------------------------===// MIRVocabulary::MIRVocabulary(VocabMap &&OpcodeEntries, - const TargetInstrInfo *TII) - : TII(*TII) { - // Fixme: Use static factory methods for creating vocabularies instead of - // public constructors - // 
Early return for invalid inputs - creates empty/invalid vocabulary - if (!TII || OpcodeEntries.empty()) - return; - + const TargetInstrInfo &TII) + : TII(TII) { buildCanonicalOpcodeMapping(); unsigned CanonicalOpcodeCount = UniqueBaseOpcodeNames.size(); @@ -67,6 +61,15 @@ MIRVocabulary::MIRVocabulary(VocabMap &&OpcodeEntries, Layout.TotalEntries = Storage.size(); } +Expected<MIRVocabulary> MIRVocabulary::create(VocabMap &&Entries, + const TargetInstrInfo &TII) { + if (Entries.empty()) + return createStringError(errc::invalid_argument, + "Empty vocabulary entries provided"); + + return MIRVocabulary(std::move(Entries), TII); +} + std::string MIRVocabulary::extractBaseOpcodeName(StringRef InstrName) { // Extract base instruction name using regex to capture letters and // underscores Examples: "ADD32rr" -> "ADD", "ARITH_FENCE" -> "ARITH_FENCE" @@ -107,13 +110,11 @@ unsigned MIRVocabulary::getCanonicalIndexForBaseName(StringRef BaseName) const { } unsigned MIRVocabulary::getCanonicalOpcodeIndex(unsigned Opcode) const { - assert(isValid() && "MIR2Vec Vocabulary is invalid"); auto BaseOpcode = extractBaseOpcodeName(TII.getName(Opcode)); return getCanonicalIndexForBaseName(BaseOpcode); } std::string MIRVocabulary::getStringKey(unsigned Pos) const { - assert(isValid() && "MIR2Vec Vocabulary is invalid"); assert(Pos < Layout.TotalEntries && "Position out of bounds in vocabulary"); // For now, all entries are opcodes since we only have one section @@ -232,16 +233,11 @@ Error MIR2VecVocabLegacyAnalysis::readVocabulary() { return Error::success(); } -void MIR2VecVocabLegacyAnalysis::emitError(Error Err, LLVMContext &Ctx) { - Ctx.emitError(toString(std::move(Err))); -} - -mir2vec::MIRVocabulary +Expected<mir2vec::MIRVocabulary> MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) { if (StrVocabMap.empty()) { if (Error Err = readVocabulary()) { - emitError(std::move(Err), M.getContext()); - return mir2vec::MIRVocabulary(std::move(StrVocabMap), nullptr); + return std::move(Err); } } @@ -255,15 +251,13 @@ MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) { if (auto *MF = MMI.getMachineFunction(F)) { const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - return mir2vec::MIRVocabulary(std::move(StrVocabMap), TII); + return mir2vec::MIRVocabulary::create(std::move(StrVocabMap), *TII); } } - // No machine functions available - return invalid vocabulary - emitError(make_error<StringError>("No machine functions found in module", - inconvertibleErrorCode()), - M.getContext()); - return mir2vec::MIRVocabulary(std::move(StrVocabMap), nullptr); + // No machine functions available - return error + return createStringError(errc::invalid_argument, + "No machine functions found in module"); } //===----------------------------------------------------------------------===// @@ -284,13 +278,15 @@ bool MIR2VecVocabPrinterLegacyPass::runOnMachineFunction(MachineFunction &MF) { bool MIR2VecVocabPrinterLegacyPass::doFinalization(Module &M) { auto &Analysis = getAnalysis<MIR2VecVocabLegacyAnalysis>(); - auto MIR2VecVocab = Analysis.getMIR2VecVocabulary(M); + auto MIR2VecVocabOrErr = Analysis.getMIR2VecVocabulary(M); - if (!MIR2VecVocab.isValid()) { - OS << "MIR2Vec Vocabulary Printer: Invalid vocabulary\n"; + if (!MIR2VecVocabOrErr) { + OS << "MIR2Vec Vocabulary Printer: Failed to get vocabulary - " + << toString(MIR2VecVocabOrErr.takeError()) << "\n"; return false; } + auto &MIR2VecVocab = *MIR2VecVocabOrErr; unsigned Pos = 0; for (const auto &Entry : MIR2VecVocab) { OS << "Key: " << 
MIR2VecVocab.getStringKey(Pos++) << ": "; diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 3a9651c..89ed4da 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -110,6 +110,7 @@ STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII"); STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found"); STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage"); STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages"); +STATISTIC(NumFailTooManyStores, "Pipeliner abort due to too many stores"); /// A command line option to turn software pipelining on or off. static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), @@ -193,6 +194,13 @@ static cl::opt<bool> MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining")); +/// A command line argument to limit the number of store instructions in the +/// target basic block. +static cl::opt<unsigned> SwpMaxNumStores( + "pipeliner-max-num-stores", + cl::desc("Maximum number of stores allowed in the target loop."), cl::Hidden, + cl::init(200)); + namespace llvm { // A command line option to enable the CopyToPhi DAG mutation. @@ -544,6 +552,23 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) { return false; } + unsigned NumStores = 0; + for (MachineInstr &MI : *L.getHeader()) + if (MI.mayStore()) + ++NumStores; + if (NumStores > SwpMaxNumStores) { + LLVM_DEBUG(dbgs() << "Too many stores\n"); + NumFailTooManyStores++; + ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop", + L.getStartLoc(), L.getHeader()) + << "Too many store instructions in the loop: " + << ore::NV("NumStores", NumStores) << " > " + << ore::NV("SwpMaxNumStores", SwpMaxNumStores) << "."; + }); + return false; + } + // Remove any subregisters from inputs to phi nodes. preprocessPhiNodes(*L.getHeader()); return true; diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index ebfea8e..e17a214 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -2051,6 +2051,12 @@ bool RegisterCoalescer::joinCopy( } if (CP.getNewRC()) { + if (RegClassInfo.getNumAllocatableRegs(CP.getNewRC()) == 0) { + LLVM_DEBUG(dbgs() << "\tNo " << TRI->getRegClassName(CP.getNewRC()) + << " are available for allocation\n"); + return false; + } + auto SrcRC = MRI->getRegClass(CP.getSrcReg()); auto DstRC = MRI->getRegClass(CP.getDstReg()); unsigned SrcIdx = CP.getSrcIdx(); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c5c3866..5ffdc4e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -19340,8 +19340,10 @@ SDValue DAGCombiner::visitFMinMax(SDNode *N) { EVT VT = N->getValueType(0); const SDNodeFlags Flags = N->getFlags(); unsigned Opc = N->getOpcode(); - bool PropagatesNaN = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM; - bool IsMin = Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM; + bool PropAllNaNsToQNaNs = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM; + bool PropOnlySNaNsToQNaNs = Opc == ISD::FMINNUM || Opc == ISD::FMAXNUM; + bool IsMin = + Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM || Opc == ISD::FMINIMUMNUM; SelectionDAG::FlagInserter FlagsInserter(DAG, N); // Constant fold.
@@ -19356,34 +19358,53 @@ SDValue DAGCombiner::visitFMinMax(SDNode *N) { if (const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1)) { const APFloat &AF = N1CFP->getValueAPF(); - // minnum(X, nan) -> X - // maxnum(X, nan) -> X - // minimum(X, nan) -> nan - // maximum(X, nan) -> nan - if (AF.isNaN()) - return PropagatesNaN ? N->getOperand(1) : N->getOperand(0); + // minnum(X, qnan) -> X + // maxnum(X, qnan) -> X + // minnum(X, snan) -> qnan + // maxnum(X, snan) -> qnan + // minimum(X, nan) -> qnan + // maximum(X, nan) -> qnan + // minimumnum(X, nan) -> X + // maximumnum(X, nan) -> X + if (AF.isNaN()) { + if (PropAllNaNsToQNaNs || (AF.isSignaling() && PropOnlySNaNsToQNaNs)) { + if (AF.isSignaling()) + return DAG.getConstantFP(AF.makeQuiet(), SDLoc(N), VT); + return N->getOperand(1); + } + return N->getOperand(0); + } // In the following folds, inf can be replaced with the largest finite // float, if the ninf flag is set. if (AF.isInfinity() || (Flags.hasNoInfs() && AF.isLargest())) { - // minnum(X, -inf) -> -inf - // maxnum(X, +inf) -> +inf + // minnum(X, -inf) -> -inf (ignoring sNaN -> qNaN propagation) + // maxnum(X, +inf) -> +inf (ignoring sNaN -> qNaN propagation) // minimum(X, -inf) -> -inf if nnan // maximum(X, +inf) -> +inf if nnan - if (IsMin == AF.isNegative() && (!PropagatesNaN || Flags.hasNoNaNs())) + // minimumnum(X, -inf) -> -inf + // maximumnum(X, +inf) -> +inf + if (IsMin == AF.isNegative() && + (!PropAllNaNsToQNaNs || Flags.hasNoNaNs())) return N->getOperand(1); // minnum(X, +inf) -> X if nnan // maxnum(X, -inf) -> X if nnan - // minimum(X, +inf) -> X - // maximum(X, -inf) -> X - if (IsMin != AF.isNegative() && (PropagatesNaN || Flags.hasNoNaNs())) + // minimum(X, +inf) -> X (ignoring quieting of sNaNs) + // maximum(X, -inf) -> X (ignoring quieting of sNaNs) + // minimumnum(X, +inf) -> X if nnan + // maximumnum(X, -inf) -> X if nnan + if (IsMin != AF.isNegative() && (PropAllNaNsToQNaNs || Flags.hasNoNaNs())) return N->getOperand(0); } } + // There are no VECREDUCE variants of FMINIMUMNUM or FMAXIMUMNUM + if (Opc == ISD::FMINIMUMNUM || Opc == ISD::FMAXIMUMNUM) + return SDValue(); + if (SDValue SD = reassociateReduction( - PropagatesNaN + PropAllNaNsToQNaNs ? (IsMin ? ISD::VECREDUCE_FMINIMUM : ISD::VECREDUCE_FMAXIMUM) : (IsMin ? 
ISD::VECREDUCE_FMIN : ISD::VECREDUCE_FMAX), Opc, SDLoc(N), VT, N0, N1, Flags)) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 87d5453..3b5f83f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3416,7 +3416,7 @@ void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, SDValue Input2 = N->getOperand(2); SDValue AccLo, AccHi; - std::tie(AccLo, AccHi) = DAG.SplitVector(Acc, DL); + GetSplitVector(Acc, AccLo, AccHi); unsigned Opcode = N->getOpcode(); // If the input types don't need splitting, just accumulate into the @@ -3429,8 +3429,8 @@ void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, SDValue Input1Lo, Input1Hi; SDValue Input2Lo, Input2Hi; - std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(Input1, DL); - std::tie(Input2Lo, Input2Hi) = DAG.SplitVector(Input2, DL); + GetSplitVector(Input1, Input1Lo, Input1Hi); + GetSplitVector(Input2, Input2Lo, Input2Hi); EVT ResultVT = AccLo.getValueType(); Lo = DAG.getNode(Opcode, DL, ResultVT, AccLo, Input1Lo, Input2Lo); @@ -4761,8 +4761,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N) { SDLoc DL(N); SDValue Input1Lo, Input1Hi, Input2Lo, Input2Hi; - std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(N->getOperand(1), DL); - std::tie(Input2Lo, Input2Hi) = DAG.SplitVector(N->getOperand(2), DL); + GetSplitVector(N->getOperand(1), Input1Lo, Input1Hi); + GetSplitVector(N->getOperand(2), Input2Lo, Input2Hi); unsigned Opcode = N->getOpcode(); EVT ResultVT = Acc.getValueType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 175753f..6c11c5b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -234,6 +234,19 @@ static bool dontUseFastISelFor(const Function &Fn) { }); } +static bool maintainPGOProfile(const TargetMachine &TM, + CodeGenOptLevel OptLevel) { + if (OptLevel != CodeGenOptLevel::None) + return true; + if (TM.getPGOOption()) { + const PGOOptions &Options = *TM.getPGOOption(); + return Options.Action == PGOOptions::PGOAction::IRUse || + Options.Action == PGOOptions::PGOAction::SampleUse || + Options.CSAction == PGOOptions::CSPGOAction::CSIRUse; + } + return false; +} + namespace llvm { //===--------------------------------------------------------------------===// @@ -395,6 +408,7 @@ SelectionDAGISel::~SelectionDAGISel() { delete CurDAG; } void SelectionDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const { CodeGenOptLevel OptLevel = Selector->OptLevel; + bool RegisterPGOPasses = maintainPGOProfile(Selector->TM, Selector->OptLevel); if (OptLevel != CodeGenOptLevel::None) AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<GCModuleInfo>(); @@ -403,15 +417,15 @@ void SelectionDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); - if (UseMBPI && OptLevel != CodeGenOptLevel::None) - AU.addRequired<BranchProbabilityInfoWrapperPass>(); + if (UseMBPI && RegisterPGOPasses) + AU.addRequired<BranchProbabilityInfoWrapperPass>(); AU.addRequired<ProfileSummaryInfoWrapperPass>(); // AssignmentTrackingAnalysis only runs if assignment tracking is enabled for // the module. 
AU.addRequired<AssignmentTrackingAnalysis>(); AU.addPreserved<AssignmentTrackingAnalysis>(); - if (OptLevel != CodeGenOptLevel::None) - LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); + if (RegisterPGOPasses) + LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); MachineFunctionPass::getAnalysisUsage(AU); } @@ -464,6 +478,7 @@ void SelectionDAGISel::initializeAnalysisResults( (void)MatchFilterFuncName; #endif + bool RegisterPGOPasses = maintainPGOProfile(TM, OptLevel); TII = MF->getSubtarget().getInstrInfo(); TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); @@ -474,7 +489,7 @@ void SelectionDAGISel::initializeAnalysisResults( auto *PSI = MAMP.getCachedResult<ProfileSummaryAnalysis>(*Fn.getParent()); BlockFrequencyInfo *BFI = nullptr; FAM.getResult<BlockFrequencyAnalysis>(Fn); - if (PSI && PSI->hasProfileSummary() && OptLevel != CodeGenOptLevel::None) + if (PSI && PSI->hasProfileSummary() && RegisterPGOPasses) BFI = &FAM.getResult<BlockFrequencyAnalysis>(Fn); FunctionVarLocs const *FnVarLocs = nullptr; @@ -492,7 +507,7 @@ void SelectionDAGISel::initializeAnalysisResults( // into account). That's unfortunate but OK because it just means we won't // ask for passes that have been required anyway. - if (UseMBPI && OptLevel != CodeGenOptLevel::None) + if (UseMBPI && RegisterPGOPasses) FuncInfo->BPI = &FAM.getResult<BranchProbabilityAnalysis>(Fn); else FuncInfo->BPI = nullptr; @@ -518,6 +533,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { (void)MatchFilterFuncName; #endif + bool RegisterPGOPasses = maintainPGOProfile(TM, OptLevel); TII = MF->getSubtarget().getInstrInfo(); TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); @@ -528,7 +544,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { AC = &MFP.getAnalysis<AssumptionCacheTracker>().getAssumptionCache(Fn); auto *PSI = &MFP.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); BlockFrequencyInfo *BFI = nullptr; - if (PSI && PSI->hasProfileSummary() && OptLevel != CodeGenOptLevel::None) + if (PSI && PSI->hasProfileSummary() && RegisterPGOPasses) BFI = &MFP.getAnalysis<LazyBlockFrequencyInfoPass>().getBFI(); FunctionVarLocs const *FnVarLocs = nullptr; @@ -549,7 +565,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { // into account). That's unfortunate but OK because it just means we won't // ask for passes that have been required anyway. 
- if (UseMBPI && OptLevel != CodeGenOptLevel::None) + if (UseMBPI && RegisterPGOPasses) FuncInfo->BPI = &MFP.getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); else diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index edc69a3..212a0c0 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -149,7 +149,8 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, if (!Name.empty()) WithColor(OS, Color) << Name; else if (Attr == DW_AT_decl_line || Attr == DW_AT_decl_column || - Attr == DW_AT_call_line || Attr == DW_AT_call_column) { + Attr == DW_AT_call_line || Attr == DW_AT_call_column || + Attr == DW_AT_language_version) { if (std::optional<uint64_t> Val = FormValue.getAsUnsignedConstant()) OS << *Val; else diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 5980ee3..286ed03 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -3623,7 +3623,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( // 1. Build a list of reduction variables. // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; auto Size = ReductionInfos.size(); - Type *PtrTy = PointerType::getUnqual(Ctx); + Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS()); + Type *FuncPtrTy = + Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace()); Type *RedArrayTy = ArrayType::get(PtrTy, Size); CodeGenIP = Builder.saveIP(); Builder.restoreIP(AllocaIP); @@ -3667,9 +3669,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( Builder.getInt64(MaxDataSize * ReductionInfos.size()); if (!IsTeamsReduction) { Value *SarFuncCast = - Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy); + Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, FuncPtrTy); Value *WcFuncCast = - Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy); + Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy); Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast, WcFuncCast}; Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr( @@ -10072,13 +10074,14 @@ void OpenMPIRBuilder::initializeTypes(Module &M) { LLVMContext &Ctx = M.getContext(); StructType *T; unsigned DefaultTargetAS = Config.getDefaultTargetAS(); + unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace(); #define OMP_TYPE(VarName, InitValue) VarName = InitValue; #define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \ VarName##Ty = ArrayType::get(ElemTy, ArraySize); \ VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS); #define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \ VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \ - VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS); + VarName##Ptr = PointerType::get(Ctx, ProgramAS); #define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) 
\ T = StructType::getTypeByName(Ctx, StructName); \ if (!T) \ diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index ae086bcd..0bc877d 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -2370,10 +2370,16 @@ static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N, Out << "!DICompileUnit("; MDFieldPrinter Printer(Out, WriterCtx); - Printer.printDwarfEnum("language", - N->getSourceLanguage().getUnversionedName(), - dwarf::LanguageString, - /* ShouldSkipZero */ false); + auto Lang = N->getSourceLanguage(); + if (Lang.hasVersionedName()) + Printer.printDwarfEnum( + "sourceLanguageName", + static_cast<llvm::dwarf::SourceLanguageName>(Lang.getName()), + dwarf::SourceLanguageNameString, + /* ShouldSkipZero */ false); + else + Printer.printDwarfEnum("language", Lang.getName(), dwarf::LanguageString, + /* ShouldSkipZero */ false); Printer.printMetadata("file", N->getRawFile(), /* ShouldSkipNull */ false); Printer.printString("producer", N->getProducer()); diff --git a/llvm/lib/IR/ConstantFPRange.cpp b/llvm/lib/IR/ConstantFPRange.cpp index 7509188..fba6942 100644 --- a/llvm/lib/IR/ConstantFPRange.cpp +++ b/llvm/lib/IR/ConstantFPRange.cpp @@ -391,3 +391,23 @@ ConstantFPRange ConstantFPRange::unionWith(const ConstantFPRange &CR) const { return ConstantFPRange(minnum(Lower, CR.Lower), maxnum(Upper, CR.Upper), MayBeQNaN | CR.MayBeQNaN, MayBeSNaN | CR.MayBeSNaN); } + +ConstantFPRange ConstantFPRange::abs() const { + if (isNaNOnly()) + return *this; + // Check if the range is all non-negative or all non-positive. + if (Lower.isNegative() == Upper.isNegative()) { + if (Lower.isNegative()) + return negate(); + return *this; + } + // The range contains both positive and negative values. + APFloat NewLower = APFloat::getZero(getSemantics()); + APFloat NewUpper = maxnum(-Lower, Upper); + return ConstantFPRange(std::move(NewLower), std::move(NewUpper), MayBeQNaN, + MayBeSNaN); +} + +ConstantFPRange ConstantFPRange::negate() const { + return ConstantFPRange(-Upper, -Lower, MayBeQNaN, MayBeSNaN); +} diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 1ae20a9f..07a870f 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -715,11 +715,20 @@ DICompositeType *DIBuilder::createArrayType( DICompositeType *DIBuilder::createVectorType(uint64_t Size, uint32_t AlignInBits, DIType *Ty, - DINodeArray Subscripts) { - auto *R = DICompositeType::get(VMContext, dwarf::DW_TAG_array_type, "", - nullptr, 0, nullptr, Ty, Size, AlignInBits, 0, - DINode::FlagVector, Subscripts, 0, - /*EnumKind=*/std::nullopt, nullptr); + DINodeArray Subscripts, + Metadata *BitStride) { + auto *R = DICompositeType::get( + VMContext, dwarf::DW_TAG_array_type, /*Name=*/"", + /*File=*/nullptr, /*Line=*/0, /*Scope=*/nullptr, /*BaseType=*/Ty, + /*SizeInBits=*/Size, /*AlignInBits=*/AlignInBits, /*OffsetInBits=*/0, + /*Flags=*/DINode::FlagVector, /*Elements=*/Subscripts, + /*RuntimeLang=*/0, /*EnumKind=*/std::nullopt, /*VTableHolder=*/nullptr, + /*TemplateParams=*/nullptr, /*Identifier=*/"", + /*Discriminator=*/nullptr, /*DataLocation=*/nullptr, + /*Associated=*/nullptr, /*Allocated=*/nullptr, /*Rank=*/nullptr, + /*Annotations=*/nullptr, /*Specification=*/nullptr, + /*NumExtraInhabitants=*/0, + /*BitStride=*/BitStride); trackIfUnresolved(R); return R; } diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp index 4f37624..8e6d654 100644 --- a/llvm/lib/IR/DiagnosticInfo.cpp +++ b/llvm/lib/IR/DiagnosticInfo.cpp @@ -273,6 +273,13 @@ 
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, C.print(OS); } +DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, + BranchProbability P) + : Key(std::string(Key)) { + raw_string_ostream OS(Val); + P.print(OS); +} + DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, DebugLoc Loc) : Key(std::string(Key)), Loc(Loc) { if (Loc) { diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 47860c0..708e79d 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -20,7 +20,7 @@ using namespace llvm::mustache; namespace { -using Accessor = SmallVector<std::string>; +using Accessor = ArrayRef<StringRef>; static bool isFalsey(const json::Value &V) { return V.getAsNull() || (V.getAsBoolean() && !V.getAsBoolean().value()) || @@ -34,23 +34,32 @@ static bool isContextFalsey(const json::Value *V) { return isFalsey(*V); } -static Accessor splitMustacheString(StringRef Str) { +static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) { // We split the mustache string into an accessor. // For example: // "a.b.c" would be split into {"a", "b", "c"} // We make an exception for a single dot which // refers to the current context. - Accessor Tokens; + SmallVector<StringRef> Tokens; if (Str == ".") { - Tokens.emplace_back(Str); - return Tokens; - } - while (!Str.empty()) { - StringRef Part; - std::tie(Part, Str) = Str.split("."); - Tokens.emplace_back(Part.trim()); + // "." is a special accessor that refers to the current context. + // It's a literal, so it doesn't need to be saved. + Tokens.push_back("."); + } else { + while (!Str.empty()) { + StringRef Part; + std::tie(Part, Str) = Str.split('.'); + // Each part of the accessor needs to be saved to the arena + // to ensure it has a stable address. + Tokens.push_back(Ctx.Saver.save(Part.trim())); + } } - return Tokens; + // Now, allocate memory for the array of StringRefs in the arena. + StringRef *ArenaTokens = Ctx.Allocator.Allocate<StringRef>(Tokens.size()); + // Copy the StringRefs from the stack vector to the arena. + std::copy(Tokens.begin(), Tokens.end(), ArenaTokens); + // Return an ArrayRef pointing to the stable arena memory. + return ArrayRef<StringRef>(ArenaTokens, Tokens.size()); } } // namespace @@ -97,23 +106,23 @@ public: SetDelimiter, }; - Token(std::string Str) - : TokenType(Type::Text), RawBody(std::move(Str)), TokenBody(RawBody), + Token(StringRef Str) + : TokenType(Type::Text), RawBody(Str), TokenBody(RawBody), AccessorValue({}), Indentation(0) {}; - Token(std::string RawBody, std::string TokenBody, char Identifier) - : RawBody(std::move(RawBody)), TokenBody(std::move(TokenBody)), - Indentation(0) { + Token(StringRef RawBody, StringRef TokenBody, char Identifier, + MustacheContext &Ctx) + : RawBody(RawBody), TokenBody(TokenBody), Indentation(0) { TokenType = getTokenType(Identifier); if (TokenType == Type::Comment) return; StringRef AccessorStr(this->TokenBody); if (TokenType != Type::Variable) AccessorStr = AccessorStr.substr(1); - AccessorValue = splitMustacheString(StringRef(AccessorStr).trim()); + AccessorValue = splitMustacheString(StringRef(AccessorStr).trim(), Ctx); } - Accessor getAccessor() const { return AccessorValue; } + ArrayRef<StringRef> getAccessor() const { return AccessorValue; } Type getType() const { return TokenType; } @@ -144,16 +153,16 @@ public: Type TokenType; // RawBody is the original string that was tokenized. 
- std::string RawBody; + StringRef RawBody; // TokenBody is the original string with the identifier removed. - std::string TokenBody; - Accessor AccessorValue; + StringRef TokenBody; + ArrayRef<StringRef> AccessorValue; size_t Indentation; }; using EscapeMap = DenseMap<char, std::string>; -class ASTNode { +class ASTNode : public ilist_node<ASTNode> { public: enum Type { Root, @@ -168,18 +177,19 @@ public: ASTNode(MustacheContext &Ctx) : Ctx(Ctx), Ty(Type::Root), Parent(nullptr), ParentContext(nullptr) {} - ASTNode(MustacheContext &Ctx, std::string Body, ASTNode *Parent) - : Ctx(Ctx), Ty(Type::Text), Body(std::move(Body)), Parent(Parent), + ASTNode(MustacheContext &Ctx, StringRef Body, ASTNode *Parent) + : Ctx(Ctx), Ty(Type::Text), Body(Body), Parent(Parent), ParentContext(nullptr) {} // Constructor for Section/InvertSection/Variable/UnescapeVariable Nodes - ASTNode(MustacheContext &Ctx, Type Ty, Accessor Accessor, ASTNode *Parent) - : Ctx(Ctx), Ty(Ty), Parent(Parent), AccessorValue(std::move(Accessor)), + ASTNode(MustacheContext &Ctx, Type Ty, ArrayRef<StringRef> Accessor, + ASTNode *Parent) + : Ctx(Ctx), Ty(Ty), Parent(Parent), AccessorValue(Accessor), ParentContext(nullptr) {} - void addChild(AstPtr Child) { Children.emplace_back(std::move(Child)); }; + void addChild(AstPtr Child) { Children.push_back(Child); }; - void setRawBody(std::string NewBody) { RawBody = std::move(NewBody); }; + void setRawBody(StringRef NewBody) { RawBody = NewBody; }; void setIndentation(size_t NewIndentation) { Indentation = NewIndentation; }; @@ -212,28 +222,27 @@ private: MustacheContext &Ctx; Type Ty; size_t Indentation = 0; - std::string RawBody; - std::string Body; + StringRef RawBody; + StringRef Body; ASTNode *Parent; - // TODO: switch implementation to SmallVector<T> - std::vector<AstPtr> Children; - const Accessor AccessorValue; + ASTNodeList Children; + const ArrayRef<StringRef> AccessorValue; const llvm::json::Value *ParentContext; }; // A wrapper for arena allocator for ASTNodes static AstPtr createRootNode(MustacheContext &Ctx) { - return std::make_unique<ASTNode>(Ctx); + return new (Ctx.Allocator.Allocate<ASTNode>()) ASTNode(Ctx); } -static AstPtr createNode(MustacheContext &Ctx, ASTNode::Type T, Accessor A, - ASTNode *Parent) { - return std::make_unique<ASTNode>(Ctx, T, std::move(A), Parent); +static AstPtr createNode(MustacheContext &Ctx, ASTNode::Type T, + ArrayRef<StringRef> A, ASTNode *Parent) { + return new (Ctx.Allocator.Allocate<ASTNode>()) ASTNode(Ctx, T, A, Parent); } -static AstPtr createTextNode(MustacheContext &Ctx, std::string Body, +static AstPtr createTextNode(MustacheContext &Ctx, StringRef Body, ASTNode *Parent) { - return std::make_unique<ASTNode>(Ctx, std::move(Body), Parent); + return new (Ctx.Allocator.Allocate<ASTNode>()) ASTNode(Ctx, Body, Parent); } // Function to check if there is meaningful text behind. @@ -295,9 +304,9 @@ static void stripTokenAhead(SmallVectorImpl<Token> &Tokens, size_t Idx) { StringRef NextTokenBody = NextToken.TokenBody; // Cut off the leading newline which could be \n or \r\n. if (NextTokenBody.starts_with("\r\n")) - NextToken.TokenBody = NextTokenBody.substr(2).str(); + NextToken.TokenBody = NextTokenBody.substr(2); else if (NextTokenBody.starts_with("\n")) - NextToken.TokenBody = NextTokenBody.substr(1).str(); + NextToken.TokenBody = NextTokenBody.substr(1); } // Adjust previous token body if there no text behind. 
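A quick illustration of the arena pattern used by splitMustacheString above: each dotted piece of the accessor is copied into the context's StringSaver, and the array of pieces into its BumpPtrAllocator, so the returned ArrayRef<StringRef> stays valid for as long as the MustacheContext lives. A minimal standalone sketch of the same idea (splitAccessor is a hypothetical name; only StringRef, StringSaver and BumpPtrAllocator from LLVM's support library are assumed):

#include <algorithm>
#include <tuple>
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/StringSaver.h"

static llvm::ArrayRef<llvm::StringRef>
splitAccessor(llvm::StringRef Str, llvm::BumpPtrAllocator &Alloc,
              llvm::StringSaver &Saver) {
  llvm::SmallVector<llvm::StringRef> Parts;
  if (Str == ".") {
    // "." is a literal that refers to the current context; no copy needed.
    Parts.push_back(".");
  } else {
    while (!Str.empty()) {
      llvm::StringRef Part;
      std::tie(Part, Str) = Str.split('.');
      // Copy each piece into the arena so it outlives this function.
      Parts.push_back(Saver.save(Part.trim()));
    }
  }
  // Give the array of pieces a stable home in the arena as well.
  llvm::StringRef *Stable = Alloc.Allocate<llvm::StringRef>(Parts.size());
  std::copy(Parts.begin(), Parts.end(), Stable);
  return llvm::ArrayRef<llvm::StringRef>(Stable, Parts.size());
}
// e.g. splitAccessor("a.b.c", Alloc, Saver) yields {"a", "b", "c"}.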
@@ -312,7 +321,7 @@ void stripTokenBefore(SmallVectorImpl<Token> &Tokens, size_t Idx, StringRef PrevTokenBody = PrevToken.TokenBody; StringRef Unindented = PrevTokenBody.rtrim(" \r\t\v"); size_t Indentation = PrevTokenBody.size() - Unindented.size(); - PrevToken.TokenBody = Unindented.str(); + PrevToken.TokenBody = Unindented; CurrentToken.setIndentation(Indentation); } @@ -402,21 +411,20 @@ static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, } static std::optional<std::pair<StringRef, StringRef>> -processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) { +processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) { LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content << ", Kind: " << tagKindToString(T.TagKind) << "\n"); if (T.TagKind == Tag::Kind::Triple) { - Tokens.emplace_back(T.FullMatch.str(), "&" + T.Content.str(), '&'); + Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx); return std::nullopt; } StringRef Interpolated = T.Content; - std::string RawBody = T.FullMatch.str(); if (!Interpolated.trim().starts_with("=")) { char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); - Tokens.emplace_back(RawBody, Interpolated.str(), Front); + Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx); return std::nullopt; } - Tokens.emplace_back(RawBody, Interpolated.str(), '='); + Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx); StringRef DelimSpec = Interpolated.trim(); DelimSpec = DelimSpec.drop_front(1); DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); @@ -432,7 +440,7 @@ processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) { // The mustache spec allows {{{ }}} to unescape variables, // but we don't support that here. An unescape variable // is represented only by {{& variable}}. -static SmallVector<Token> tokenize(StringRef Template) { +static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) { LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n"); SmallVector<Token> Tokens; SmallString<8> Open("{{"); @@ -446,19 +454,17 @@ static SmallVector<Token> tokenize(StringRef Template) { if (T.TagKind == Tag::Kind::None) { // No more tags, the rest is text. - Tokens.emplace_back(Template.substr(Start).str()); - LLVM_DEBUG(dbgs() << " No more tags. Created final Text token: \"" - << Template.substr(Start) << "\"\n"); + Tokens.emplace_back(Template.substr(Start)); break; } // Add the text before the tag. 
if (T.StartPosition > Start) { StringRef Text = Template.substr(Start, T.StartPosition - Start); - Tokens.emplace_back(Text.str()); + Tokens.emplace_back(Text); } - if (auto NewDelims = processTag(T, Tokens)) { + if (auto NewDelims = processTag(T, Tokens, Ctx)) { std::tie(Open, Close) = *NewDelims; } @@ -614,20 +620,20 @@ void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, const Accessor &A) { AstPtr CurrentNode = createNode(Ctx, Ty, A, Parent); size_t Start = CurrentPtr; - parseMustache(CurrentNode.get()); + parseMustache(CurrentNode); const size_t End = CurrentPtr - 1; - std::string RawBody; + SmallString<128> RawBody; for (std::size_t I = Start; I < End; I++) RawBody += Tokens[I].RawBody; - CurrentNode->setRawBody(std::move(RawBody)); - Parent->addChild(std::move(CurrentNode)); + CurrentNode->setRawBody(Ctx.Saver.save(StringRef(RawBody))); + Parent->addChild(CurrentNode); } AstPtr Parser::parse() { - Tokens = tokenize(TemplateStr); + Tokens = tokenize(TemplateStr, Ctx); CurrentPtr = 0; AstPtr RootNode = createRootNode(Ctx); - parseMustache(RootNode.get()); + parseMustache(RootNode); return RootNode; } @@ -636,31 +642,29 @@ void Parser::parseMustache(ASTNode *Parent) { while (CurrentPtr < Tokens.size()) { Token CurrentToken = Tokens[CurrentPtr]; CurrentPtr++; - Accessor A = CurrentToken.getAccessor(); + ArrayRef<StringRef> A = CurrentToken.getAccessor(); AstPtr CurrentNode; switch (CurrentToken.getType()) { case Token::Type::Text: { - CurrentNode = - createTextNode(Ctx, std::move(CurrentToken.TokenBody), Parent); - Parent->addChild(std::move(CurrentNode)); + CurrentNode = createTextNode(Ctx, CurrentToken.TokenBody, Parent); + Parent->addChild(CurrentNode); break; } case Token::Type::Variable: { - CurrentNode = createNode(Ctx, ASTNode::Variable, std::move(A), Parent); - Parent->addChild(std::move(CurrentNode)); + CurrentNode = createNode(Ctx, ASTNode::Variable, A, Parent); + Parent->addChild(CurrentNode); break; } case Token::Type::UnescapeVariable: { - CurrentNode = - createNode(Ctx, ASTNode::UnescapeVariable, std::move(A), Parent); - Parent->addChild(std::move(CurrentNode)); + CurrentNode = createNode(Ctx, ASTNode::UnescapeVariable, A, Parent); + Parent->addChild(CurrentNode); break; } case Token::Type::Partial: { - CurrentNode = createNode(Ctx, ASTNode::Partial, std::move(A), Parent); + CurrentNode = createNode(Ctx, ASTNode::Partial, A, Parent); CurrentNode->setIndentation(CurrentToken.getIndentation()); - Parent->addChild(std::move(CurrentNode)); + Parent->addChild(CurrentNode); break; } case Token::Type::SectionOpen: { @@ -694,8 +698,7 @@ static void toMustacheString(const json::Value &Data, raw_ostream &OS) { return; } case json::Value::String: { - auto Str = *Data.getAsString(); - OS << Str.str(); + OS << *Data.getAsString(); return; } @@ -727,7 +730,7 @@ void ASTNode::renderPartial(const json::Value &CurrentCtx, << ", Indentation:" << Indentation << "\n"); auto Partial = Ctx.Partials.find(AccessorValue[0]); if (Partial != Ctx.Partials.end()) - renderPartial(CurrentCtx, OS, Partial->getValue().get()); + renderPartial(CurrentCtx, OS, Partial->getValue()); } void ASTNode::renderVariable(const json::Value &CurrentCtx, @@ -858,8 +861,8 @@ const json::Value *ASTNode::findContext() { void ASTNode::renderChild(const json::Value &Contexts, MustacheOutputStream &OS) { - for (AstPtr &Child : Children) - Child->render(Contexts, OS); + for (ASTNode &Child : Children) + Child.render(Contexts, OS); } void ASTNode::renderPartial(const json::Value &Contexts, @@ -869,7 +872,7 @@ void 
ASTNode::renderPartial(const json::Value &Contexts, Partial->render(Contexts, IS); } -void ASTNode::renderLambdas(const json::Value &Contexts, +void ASTNode::renderLambdas(const llvm::json::Value &Contexts, MustacheOutputStream &OS, Lambda &L) { json::Value LambdaResult = L(); std::string LambdaStr; @@ -886,9 +889,9 @@ void ASTNode::renderLambdas(const json::Value &Contexts, LambdaNode->render(Contexts, OS); } -void ASTNode::renderSectionLambdas(const json::Value &Contexts, +void ASTNode::renderSectionLambdas(const llvm::json::Value &Contexts, MustacheOutputStream &OS, SectionLambda &L) { - json::Value Return = L(RawBody); + json::Value Return = L(RawBody.str()); if (isFalsey(Return)) return; std::string LambdaStr; @@ -899,15 +902,16 @@ void ASTNode::renderSectionLambdas(const json::Value &Contexts, LambdaNode->render(Contexts, OS); } -void Template::render(const json::Value &Data, llvm::raw_ostream &OS) { +void Template::render(const llvm::json::Value &Data, llvm::raw_ostream &OS) { RawMustacheOutputStream MOS(OS); Tree->render(Data, MOS); } void Template::registerPartial(std::string Name, std::string Partial) { - Parser P(Partial, Ctx); + StringRef SavedPartial = Ctx.Saver.save(Partial); + Parser P(SavedPartial, Ctx); AstPtr PartialTree = P.parse(); - Ctx.Partials.insert(std::make_pair(Name, std::move(PartialTree))); + Ctx.Partials.insert(std::make_pair(Name, PartialTree)); } void Template::registerLambda(std::string Name, Lambda L) { @@ -922,7 +926,7 @@ void Template::overrideEscapeCharacters(EscapeMap E) { Ctx.Escapes = std::move(E); } -Template::Template(StringRef TemplateStr) { +Template::Template(StringRef TemplateStr, MustacheContext &Ctx) : Ctx(Ctx) { Parser P(TemplateStr, Ctx); Tree = P.parse(); // The default behavior is to escape html entities. @@ -935,18 +939,12 @@ Template::Template(StringRef TemplateStr) { } Template::Template(Template &&Other) noexcept - : Ctx(std::move(Other.Ctx)), Tree(std::move(Other.Tree)) {} + : Ctx(Other.Ctx), Tree(Other.Tree) { + Other.Tree = nullptr; +} Template::~Template() = default; -Template &Template::operator=(Template &&Other) noexcept { - if (this != &Other) { - Ctx = std::move(Other.Ctx); - Tree = std::move(Other.Tree); - Other.Tree = nullptr; - } - return *this; -} } // namespace llvm::mustache #undef DEBUG_TYPE diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 639ddcb..ecaeff7 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -350,7 +350,7 @@ def AArch64PostLegalizerLowering // Post-legalization combines which are primarily optimizations. 
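Before the AArch64Combine.td change below, a usage note on the Mustache Template constructor rewritten just above: a Template no longer owns its context, so callers create a MustacheContext first and keep it alive at least as long as every Template (and registered partial) built from it. A hedged sketch, assuming MustacheContext is default-constructible and exported from llvm/Support/Mustache.h as the patch suggests:

#include "llvm/Support/JSON.h"
#include "llvm/Support/Mustache.h"
#include "llvm/Support/raw_ostream.h"

void renderGreeting(llvm::raw_ostream &OS) {
  // The context owns the string arena, the partials map and the escape
  // settings; it must outlive every Template parsed against it.
  llvm::mustache::MustacheContext Ctx;
  llvm::mustache::Template T("Hello, {{name}}!", Ctx);
  T.render(llvm::json::Object{{"name", "world"}}, OS); // prints "Hello, world!"
}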
def AArch64PostLegalizerCombiner : GICombiner<"AArch64PostLegalizerCombinerImpl", - [copy_prop, cast_of_cast_combines, + [copy_prop, cast_of_cast_combines, constant_fold_fp_ops, buildvector_of_truncate, integer_of_truncate, mutate_anyext_to_zext, combines_for_extload, combine_indexed_load_store, sext_trunc_sextload, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index dc8e7c8..7294f3e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1458,6 +1458,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal); setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal); + setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v16i8, Custom); setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom); if (Subtarget->hasMatMulInt8()) { @@ -16248,7 +16249,9 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { bool Negated; uint64_t SplatVal; - if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) { + // NOTE: SRAD cannot be used to represent sdiv-by-one. + if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) && + SplatVal > 1) { SDValue Pg = getPredicateForScalableVector(DAG, DL, VT); SDValue Res = DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0), @@ -30033,7 +30036,9 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE( bool Negated; uint64_t SplatVal; - if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) { + // NOTE: SRAD cannot be used to represent sdiv-by-one. + if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) && + SplatVal > 1) { EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32); @@ -30605,6 +30610,43 @@ AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op, assert(OpVT.isScalableVector() && "Expected scalable vector in LowerVECTOR_DEINTERLEAVE."); + if (Op->getNumOperands() == 3) { + // aarch64_sve_ld3 only supports packed datatypes. + EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount()); + Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false); + SDValue StackPtr = + DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment); + + // Write out unmodified operands. + SmallVector<SDValue, 3> Chains; + for (unsigned I = 0; I < 3; ++I) { + SDValue Ptr = + DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL); + SDValue V = getSVESafeBitCast(PackedVT, Op.getOperand(I), DAG); + Chains.push_back( + DAG.getStore(DAG.getEntryNode(), DL, V, Ptr, MachinePointerInfo())); + } + + Intrinsic::ID IntID = Intrinsic::aarch64_sve_ld3_sret; + EVT PredVT = PackedVT.changeVectorElementType(MVT::i1); + + SmallVector<SDValue, 7> Ops; + Ops.push_back(DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains)); + Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64)); + Ops.push_back(DAG.getConstant(1, DL, PredVT)); + Ops.push_back(StackPtr); + + // Read back and deinterleave data. 
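As an aside on the three-operand VECTOR_DEINTERLEAVE lowering in this hunk (contiguous stores above, ld3 read-back just below): ld3 performs a stride-3 structured load, so writing the operands back-to-back and reloading them de-interleaves the concatenated data in one step. A rough scalar model of that round trip, illustrative only:

#include <algorithm>
#include <array>
#include <cstddef>

template <typename T, std::size_t N>
std::array<std::array<T, N>, 3> deinterleave3(const std::array<T, N> &A,
                                              const std::array<T, N> &B,
                                              const std::array<T, N> &C) {
  // Contiguous "st1"-style stores of the unmodified operands.
  std::array<T, 3 * N> Buf;
  std::copy(A.begin(), A.end(), Buf.begin());
  std::copy(B.begin(), B.end(), Buf.begin() + N);
  std::copy(C.begin(), C.end(), Buf.begin() + 2 * N);
  // "ld3"-style structured load: result R, lane J comes from Buf[3*J + R].
  std::array<std::array<T, N>, 3> Out;
  for (std::size_t R = 0; R < 3; ++R)
    for (std::size_t J = 0; J < N; ++J)
      Out[R][J] = Buf[3 * J + R];
  return Out;
}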
+ SDVTList VTs = DAG.getVTList(PackedVT, PackedVT, PackedVT, MVT::Other); + SDValue LD3 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops); + + SmallVector<SDValue, 3> Results; + Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(0), DAG)); + Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(1), DAG)); + Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(2), DAG)); + return DAG.getMergeValues(Results, DL); + } + // Are multi-register uzp instructions available? if (Subtarget->hasSME2() && Subtarget->isStreaming() && OpVT.getVectorElementType() != MVT::i1) { @@ -30646,6 +30688,42 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op, assert(OpVT.isScalableVector() && "Expected scalable vector in LowerVECTOR_INTERLEAVE."); + if (Op->getNumOperands() == 3) { + // aarch64_sve_st3 only supports packed datatypes. + EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount()); + SmallVector<SDValue, 3> InVecs; + for (SDValue V : Op->ops()) + InVecs.push_back(getSVESafeBitCast(PackedVT, V, DAG)); + + Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false); + SDValue StackPtr = + DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment); + + Intrinsic::ID IntID = Intrinsic::aarch64_sve_st3; + EVT PredVT = PackedVT.changeVectorElementType(MVT::i1); + + SmallVector<SDValue, 7> Ops; + Ops.push_back(DAG.getEntryNode()); + Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64)); + Ops.append(InVecs); + Ops.push_back(DAG.getConstant(1, DL, PredVT)); + Ops.push_back(StackPtr); + + // Interleave operands and store. + SDValue Chain = DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops); + + // Read back the interleaved data. + SmallVector<SDValue, 3> Results; + for (unsigned I = 0; I < 3; ++I) { + SDValue Ptr = + DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL); + SDValue L = DAG.getLoad(PackedVT, DL, Chain, Ptr, MachinePointerInfo()); + Results.push_back(getSVESafeBitCast(OpVT, L, DAG)); + } + + return DAG.getMergeValues(Results, DL); + } + // Are multi-register zip instructions available? if (Subtarget->hasSME2() && Subtarget->isStreaming() && OpVT.getVectorElementType() != MVT::i1) { @@ -30769,6 +30847,17 @@ AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op, ResultVT.isFixedLengthVector() && useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true); + // We can handle this case natively by accumulating into a wider + // zero-padded vector. 
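On the v2i32/v16i8 PARTIAL_REDUCE_MLA case introduced by the comment above (its code follows below): as far as the partial-reduction semantics go, the grouping of input elements into result lanes is unspecified, only the overall sum has to be preserved, which is what makes it legal to zero-pad the accumulator to v4i32, use the native four-lane form, and fold the lanes back down with ADDP. A rough scalar model, not the actual DAG nodes:

#include <array>
#include <cstdint>

// Widen the 2-lane accumulator with zeros, accumulate four byte-products per
// wide lane (the legal v4i32/v16i8 form), then pairwise-add (ADDP) and keep
// the low two lanes. The total across the result lanes is still
// Acc[0] + Acc[1] + the sum of all 16 products.
static std::array<int32_t, 2>
partialReduceV2I32V16I8(std::array<int32_t, 2> Acc, const int8_t (&L)[16],
                        const int8_t (&R)[16]) {
  std::array<int32_t, 4> Wide = {Acc[0], Acc[1], 0, 0};
  for (int I = 0; I < 16; ++I)
    Wide[I / 4] += int32_t(L[I]) * int32_t(R[I]);
  // ADDP Wide, Wide -> {W0+W1, W2+W3, W0+W1, W2+W3}; extract the low half.
  return {Wide[0] + Wide[1], Wide[2] + Wide[3]};
}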
+ if (!ConvertToScalable && ResultVT == MVT::v2i32 && OpVT == MVT::v16i8) { + SDValue ZeroVec = DAG.getConstant(0, DL, MVT::v4i32); + SDValue WideAcc = DAG.getInsertSubvector(DL, ZeroVec, Acc, 0); + SDValue Wide = + DAG.getNode(Op.getOpcode(), DL, MVT::v4i32, WideAcc, LHS, RHS); + SDValue Reduced = DAG.getNode(AArch64ISD::ADDP, DL, MVT::v4i32, Wide, Wide); + return DAG.getExtractSubvector(DL, MVT::v2i32, Reduced, 0); + } + if (ConvertToScalable) { ResultVT = getContainerForFixedLengthVector(DAG, ResultVT); OpVT = getContainerForFixedLengthVector(DAG, LHS.getValueType()); diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 91e64e6..bd0a17d 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -315,6 +315,8 @@ public: } void setStackSizeSVE(uint64_t ZPR, uint64_t PPR) { + assert(isAligned(Align(16), ZPR) && isAligned(Align(16), PPR) && + "expected SVE stack sizes to be aligned to 16-bytes"); StackSizeZPR = ZPR; StackSizePPR = PPR; HasCalculatedStackSizeSVE = true; @@ -425,6 +427,8 @@ public: // Saves the CalleeSavedStackSize for SVE vectors in 'scalable bytes' void setSVECalleeSavedStackSize(unsigned ZPR, unsigned PPR) { + assert(isAligned(Align(16), ZPR) && isAligned(Align(16), PPR) && + "expected SVE callee-save sizes to be aligned to 16-bytes"); ZPRCalleeSavedStackSize = ZPR; PPRCalleeSavedStackSize = PPR; HasSVECalleeSavedStackSize = true; diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 1568161..f110558 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -60,7 +60,6 @@ static bool isPartOfZPRCalleeSaves(MachineBasicBlock::iterator I) { case AArch64::PTRUE_C_B: return I->getFlag(MachineInstr::FrameSetup) || I->getFlag(MachineInstr::FrameDestroy); - case AArch64::SEH_SavePReg: case AArch64::SEH_SaveZReg: return true; } @@ -75,6 +74,8 @@ static bool isPartOfPPRCalleeSaves(MachineBasicBlock::iterator I) { case AArch64::LDR_PXI: return I->getFlag(MachineInstr::FrameSetup) || I->getFlag(MachineInstr::FrameDestroy); + case AArch64::SEH_SavePReg: + return true; } } @@ -94,6 +95,26 @@ AArch64PrologueEpilogueCommon::AArch64PrologueEpilogueCommon( HasFP = AFL.hasFP(MF); NeedsWinCFI = AFL.needsWinCFI(MF); + + // Windows unwind can't represent the required stack adjustments if we have + // both SVE callee-saves and dynamic stack allocations, and the frame pointer + // is before the SVE spills. The allocation of the frame pointer must be the + // last instruction in the prologue so the unwinder can restore the stack + // pointer correctly. (And there isn't any unwind opcode for `addvl sp, x29, + // -17`.) + // + // Because of this, we do spills in the opposite order on Windows: first SVE, + // then GPRs. The main side-effect of this is that it makes accessing + // parameters passed on the stack more expensive. + // + // We could consider rearranging the spills for simpler cases. 
+ if (Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize()) { + if (AFI->hasStackHazardSlotIndex()) + reportFatalUsageError("SME hazard padding is not supported on Windows"); + SVELayout = SVEStackLayout::CalleeSavesAboveFrameRecord; + } else if (AFI->hasSplitSVEObjects()) { + SVELayout = SVEStackLayout::Split; + } } MachineBasicBlock::iterator @@ -334,6 +355,55 @@ bool AArch64PrologueEpilogueCommon::shouldCombineCSRLocalStackBump( return true; } +SVEFrameSizes AArch64PrologueEpilogueCommon::getSVEStackFrameSizes() const { + StackOffset PPRCalleeSavesSize = + StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize()); + StackOffset ZPRCalleeSavesSize = + StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize()); + StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize; + StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize; + if (SVELayout == SVEStackLayout::Split) + return {{PPRCalleeSavesSize, PPRLocalsSize}, + {ZPRCalleeSavesSize, ZPRLocalsSize}}; + // For simplicity, attribute all locals to ZPRs when split SVE is disabled. + return {{PPRCalleeSavesSize, StackOffset{}}, + {ZPRCalleeSavesSize, PPRLocalsSize + ZPRLocalsSize}}; +} + +struct SVEPartitions { + struct { + MachineBasicBlock::iterator Begin, End; + } PPR, ZPR; +}; + +static SVEPartitions partitionSVECS(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + StackOffset PPRCalleeSavesSize, + StackOffset ZPRCalleeSavesSize, + bool IsEpilogue) { + MachineBasicBlock::iterator PPRsI = MBBI; + MachineBasicBlock::iterator End = + IsEpilogue ? MBB.begin() : MBB.getFirstTerminator(); + auto AdjustI = [&](auto MBBI) { return IsEpilogue ? std::prev(MBBI) : MBBI; }; + // Process the SVE CS to find the starts/ends of the ZPR and PPR areas. + if (PPRCalleeSavesSize) { + PPRsI = AdjustI(PPRsI); + assert(isPartOfPPRCalleeSaves(*PPRsI) && "Unexpected instruction"); + while (PPRsI != End && isPartOfPPRCalleeSaves(AdjustI(PPRsI))) + IsEpilogue ? (--PPRsI) : (++PPRsI); + } + MachineBasicBlock::iterator ZPRsI = PPRsI; + if (ZPRCalleeSavesSize) { + ZPRsI = AdjustI(ZPRsI); + assert(isPartOfZPRCalleeSaves(*ZPRsI) && "Unexpected instruction"); + while (ZPRsI != End && isPartOfZPRCalleeSaves(AdjustI(ZPRsI))) + IsEpilogue ? (--ZPRsI) : (++ZPRsI); + } + if (IsEpilogue) + return {{PPRsI, MBBI}, {ZPRsI, PPRsI}}; + return {{MBBI, PPRsI}, {PPRsI, ZPRsI}}; +} + AArch64PrologueEmitter::AArch64PrologueEmitter(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64FrameLowering &AFL) @@ -613,30 +683,12 @@ void AArch64PrologueEmitter::emitPrologue() { bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg()); unsigned FixedObject = AFL.getFixedObjectSize(MF, AFI, IsWin64, IsFunclet); - // Windows unwind can't represent the required stack adjustments if we have - // both SVE callee-saves and dynamic stack allocations, and the frame - // pointer is before the SVE spills. The allocation of the frame pointer - // must be the last instruction in the prologue so the unwinder can restore - // the stack pointer correctly. (And there isn't any unwind opcode for - // `addvl sp, x29, -17`.) - // - // Because of this, we do spills in the opposite order on Windows: first SVE, - // then GPRs. The main side-effect of this is that it makes accessing - // parameters passed on the stack more expensive. - // - // We could consider rearranging the spills for simpler cases. 
- bool FPAfterSVECalleeSaves = - Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize(); - - if (FPAfterSVECalleeSaves && AFI->hasStackHazardSlotIndex()) - reportFatalUsageError("SME hazard padding is not supported on Windows"); - auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; // All of the remaining stack allocations are for locals. determineLocalsStackSize(NumBytes, PrologueSaveSize); MachineBasicBlock::iterator FirstGPRSaveI = PrologueBeginI; - if (FPAfterSVECalleeSaves) { + if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { // If we're doing SVE saves first, we need to immediately allocate space // for fixed objects, then space for the SVE callee saves. // @@ -712,110 +764,66 @@ void AArch64PrologueEmitter::emitPrologue() { if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding); - StackOffset PPRCalleeSavesSize = - StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize()); - StackOffset ZPRCalleeSavesSize = - StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize()); - StackOffset SVECalleeSavesSize = PPRCalleeSavesSize + ZPRCalleeSavesSize; - StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize; - StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize; - - std::optional<MachineBasicBlock::iterator> ZPRCalleeSavesBegin, - ZPRCalleeSavesEnd, PPRCalleeSavesBegin, PPRCalleeSavesEnd; - + auto [PPR, ZPR] = getSVEStackFrameSizes(); + StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; + StackOffset NonSVELocalsSize = StackOffset::getFixed(NumBytes); StackOffset CFAOffset = - StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); + StackOffset::getFixed(MFI.getStackSize()) - NonSVELocalsSize; + MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI; - if (!FPAfterSVECalleeSaves) { - // Process the SVE callee-saves to find the starts/ends of the ZPR and PPR - // areas. - PPRCalleeSavesBegin = AfterGPRSavesI; - if (PPRCalleeSavesSize) { - LLVM_DEBUG(dbgs() << "PPRCalleeSavedStackSize = " - << PPRCalleeSavesSize.getScalable() << "\n"); - - assert(isPartOfPPRCalleeSaves(*PPRCalleeSavesBegin) && - "Unexpected instruction"); - while (isPartOfPPRCalleeSaves(AfterSVESavesI) && - AfterSVESavesI != MBB.getFirstTerminator()) - ++AfterSVESavesI; + // Allocate space for the callee saves and PPR locals (if any). 
+ if (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord) { + auto [PPRRange, ZPRRange] = + partitionSVECS(MBB, AfterGPRSavesI, PPR.CalleeSavesSize, + ZPR.CalleeSavesSize, /*IsEpilogue=*/false); + AfterSVESavesI = ZPRRange.End; + if (EmitAsyncCFI) + emitCalleeSavedSVELocations(AfterSVESavesI); + + StackOffset AllocateBeforePPRs = SVECalleeSavesSize; + StackOffset AllocateAfterPPRs = PPR.LocalsSize; + if (SVELayout == SVEStackLayout::Split) { + AllocateBeforePPRs = PPR.CalleeSavesSize; + AllocateAfterPPRs = PPR.LocalsSize + ZPR.CalleeSavesSize; } - PPRCalleeSavesEnd = ZPRCalleeSavesBegin = AfterSVESavesI; - if (ZPRCalleeSavesSize) { - LLVM_DEBUG(dbgs() << "ZPRCalleeSavedStackSize = " - << ZPRCalleeSavesSize.getScalable() << "\n"); - assert(isPartOfZPRCalleeSaves(*ZPRCalleeSavesBegin) && - "Unexpected instruction"); - while (isPartOfZPRCalleeSaves(AfterSVESavesI) && - AfterSVESavesI != MBB.getFirstTerminator()) - ++AfterSVESavesI; - } - ZPRCalleeSavesEnd = AfterSVESavesI; - } - - if (EmitAsyncCFI) - emitCalleeSavedSVELocations(AfterSVESavesI); - - if (AFI->hasSplitSVEObjects()) { - assert(!FPAfterSVECalleeSaves && - "Cannot use FPAfterSVECalleeSaves with aarch64-split-sve-objects"); - assert(!AFL.canUseRedZone(MF) && - "Cannot use redzone with aarch64-split-sve-objects"); - // TODO: Handle HasWinCFI/NeedsWinCFI? - assert(!NeedsWinCFI && - "WinCFI with aarch64-split-sve-objects is not supported"); - - // Split ZPR and PPR allocation. - // Allocate PPR callee saves - allocateStackSpace(*PPRCalleeSavesBegin, 0, PPRCalleeSavesSize, + allocateStackSpace(PPRRange.Begin, 0, AllocateBeforePPRs, EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || ZPRCalleeSavesSize || - ZPRLocalsSize || PPRLocalsSize); - CFAOffset += PPRCalleeSavesSize; - - // Allocate PPR locals + ZPR callee saves - assert(PPRCalleeSavesEnd == ZPRCalleeSavesBegin && + MFI.hasVarSizedObjects() || AllocateAfterPPRs || + ZPR.LocalsSize || NonSVELocalsSize); + CFAOffset += AllocateBeforePPRs; + assert(PPRRange.End == ZPRRange.Begin && "Expected ZPR callee saves after PPR locals"); - allocateStackSpace(*PPRCalleeSavesEnd, RealignmentPadding, - PPRLocalsSize + ZPRCalleeSavesSize, - EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || ZPRLocalsSize); - CFAOffset += PPRLocalsSize + ZPRCalleeSavesSize; - - // Allocate ZPR locals - allocateStackSpace(*ZPRCalleeSavesEnd, RealignmentPadding, - ZPRLocalsSize + StackOffset::getFixed(NumBytes), + allocateStackSpace(PPRRange.End, RealignmentPadding, AllocateAfterPPRs, EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects()); + MFI.hasVarSizedObjects() || ZPR.LocalsSize || + NonSVELocalsSize); + CFAOffset += AllocateAfterPPRs; } else { - // Allocate space for the callee saves (if any). - StackOffset LocalsSize = - PPRLocalsSize + ZPRLocalsSize + StackOffset::getFixed(NumBytes); - if (!FPAfterSVECalleeSaves) - allocateStackSpace(AfterGPRSavesI, 0, SVECalleeSavesSize, - EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || LocalsSize); + assert(SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord); + // Note: With CalleeSavesAboveFrameRecord, the SVE CS have already been + // allocated (and separate PPR locals are not supported, all SVE locals, + // both PPR and ZPR, are within the ZPR locals area). + assert(!PPR.LocalsSize && "Unexpected PPR locals!"); CFAOffset += SVECalleeSavesSize; + } - // Allocate space for the rest of the frame including SVE locals. Align the - // stack as necessary. 
- assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) && - "Cannot use redzone with stack realignment"); - if (!AFL.canUseRedZone(MF)) { - // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have - // the correct value here, as NumBytes also includes padding bytes, - // which shouldn't be counted here. - StackOffset SVELocalsSize = PPRLocalsSize + ZPRLocalsSize; - allocateStackSpace(AfterSVESavesI, RealignmentPadding, - SVELocalsSize + StackOffset::getFixed(NumBytes), - EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects()); - } + // Allocate space for the rest of the frame including ZPR locals. Align the + // stack as necessary. + assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) && + "Cannot use redzone with stack realignment"); + if (!AFL.canUseRedZone(MF)) { + // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have the + // correct value here, as NumBytes also includes padding bytes, which + // shouldn't be counted here. + allocateStackSpace( + AfterSVESavesI, RealignmentPadding, ZPR.LocalsSize + NonSVELocalsSize, + EmitAsyncCFI && !HasFP, CFAOffset, MFI.hasVarSizedObjects()); } // If we need a base pointer, set it up here. It's whatever the value of the - // stack pointer is at this point. Any variable size objects will be allocated - // after this, so we can still use the base pointer to reference locals. + // stack pointer is at this point. Any variable size objects will be + // allocated after this, so we can still use the base pointer to reference + // locals. // // FIXME: Clarify FrameSetup flags here. // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is @@ -1270,7 +1278,9 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations( StackOffset::getScalable(MFI.getObjectOffset(FI)) - StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI)); - if (AFI->hasSplitSVEObjects() && + // The scalable vectors are below (lower address) the scalable predicates + // with split SVE objects, so we must subtract the size of the predicates. + if (SVELayout == SVEStackLayout::Split && MFI.getStackID(FI) == TargetStackID::ScalableVector) Offset -= PPRStackSize; @@ -1349,13 +1359,10 @@ void AArch64EpilogueEmitter::emitEpilogue() { return; } - bool FPAfterSVECalleeSaves = - Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize(); - bool CombineSPBump = shouldCombineCSRLocalStackBump(NumBytes); // Assume we can't combine the last pop with the sp restore. 
bool CombineAfterCSRBump = false; - if (FPAfterSVECalleeSaves) { + if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { AfterCSRPopSize += FixedObject; } else if (!CombineSPBump && PrologueSaveSize != 0) { MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator()); @@ -1390,7 +1397,8 @@ void AArch64EpilogueEmitter::emitEpilogue() { while (FirstGPRRestoreI != Begin) { --FirstGPRRestoreI; if (!FirstGPRRestoreI->getFlag(MachineInstr::FrameDestroy) || - (!FPAfterSVECalleeSaves && isPartOfSVECalleeSaves(FirstGPRRestoreI))) { + (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord && + isPartOfSVECalleeSaves(FirstGPRRestoreI))) { ++FirstGPRRestoreI; break; } else if (CombineSPBump) @@ -1414,13 +1422,9 @@ void AArch64EpilogueEmitter::emitEpilogue() { if (HasFP && AFI->hasSwiftAsyncContext()) emitSwiftAsyncContextFramePointer(EpilogueEndI, DL); - StackOffset ZPRStackSize = AFL.getZPRStackSize(MF); - StackOffset PPRStackSize = AFL.getPPRStackSize(MF); - StackOffset SVEStackSize = ZPRStackSize + PPRStackSize; - // If there is a single SP update, insert it before the ret and we're done. if (CombineSPBump) { - assert(!SVEStackSize && "Cannot combine SP bump with SVE"); + assert(!AFI->hasSVEStackSize() && "Cannot combine SP bump with SVE"); // When we are about to restore the CSRs, the CFA register is SP again. if (EmitCFI && HasFP) @@ -1437,188 +1441,122 @@ void AArch64EpilogueEmitter::emitEpilogue() { NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); - if (!AFI->hasSplitSVEObjects()) { - // Process the SVE callee-saves to determine what space needs to be - // deallocated. - StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; - MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI, - RestoreEnd = FirstGPRRestoreI; - int64_t ZPRCalleeSavedSize = AFI->getZPRCalleeSavedStackSize(); - int64_t PPRCalleeSavedSize = AFI->getPPRCalleeSavedStackSize(); - int64_t SVECalleeSavedSize = ZPRCalleeSavedSize + PPRCalleeSavedSize; - - if (SVECalleeSavedSize) { - if (FPAfterSVECalleeSaves) - RestoreEnd = MBB.getFirstTerminator(); - - RestoreBegin = std::prev(RestoreEnd); - while (RestoreBegin != MBB.begin() && - isPartOfSVECalleeSaves(std::prev(RestoreBegin))) - --RestoreBegin; - - assert(isPartOfSVECalleeSaves(RestoreBegin) && - isPartOfSVECalleeSaves(std::prev(RestoreEnd)) && - "Unexpected instruction"); - - StackOffset CalleeSavedSizeAsOffset = - StackOffset::getScalable(SVECalleeSavedSize); - DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; - DeallocateAfter = CalleeSavedSizeAsOffset; + auto [PPR, ZPR] = getSVEStackFrameSizes(); + auto [PPRRange, ZPRRange] = partitionSVECS( + MBB, + SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord + ? MBB.getFirstTerminator() + : FirstGPRRestoreI, + PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true); + + StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; + StackOffset SVEStackSize = + SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize; + MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin; + MachineBasicBlock::iterator RestoreEnd = PPRRange.End; + + // Deallocate the SVE area. + if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { + StackOffset SVELocalsSize = ZPR.LocalsSize + PPR.LocalsSize; + // If the callee-save area is before FP, restoring the FP implicitly + // deallocates non-callee-save SVE allocations. Otherwise, deallocate them + // explicitly. 
+ if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { + emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, + SVELocalsSize, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI); } - // Deallocate the SVE area. - if (FPAfterSVECalleeSaves) { - // If the callee-save area is before FP, restoring the FP implicitly - // deallocates non-callee-save SVE allocations. Otherwise, deallocate - // them explicitly. - if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { - emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, - DeallocateBefore, TII, MachineInstr::FrameDestroy, - false, NeedsWinCFI, &HasWinCFI); - } + // Deallocate callee-save non-SVE registers. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - // Deallocate callee-save non-SVE registers. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(AFI->getCalleeSavedStackSize()), - TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI); - - // Deallocate fixed objects. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(FixedObject), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI); - - // Deallocate callee-save SVE registers. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - DeallocateAfter, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); - } else if (SVEStackSize) { - int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize(); - // If we have stack realignment or variable-sized objects we must use the - // FP to restore SVE callee saves (as there is an unknown amount of - // data/padding between the SP and SVE CS area). - Register BaseForSVEDealloc = - (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP - : AArch64::SP; - if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) { - Register CalleeSaveBase = AArch64::FP; - if (int64_t CalleeSaveBaseOffset = - AFI->getCalleeSaveBaseToFrameRecordOffset()) { - // If we have have an non-zero offset to the non-SVE CS base we need - // to compute the base address by subtracting the offest in a - // temporary register first (to avoid briefly deallocating the SVE - // CS). - CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); - emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, - StackOffset::getFixed(-CalleeSaveBaseOffset), TII, - MachineInstr::FrameDestroy); - } - // The code below will deallocate the stack space space by moving the - // SP to the start of the SVE callee-save area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, - StackOffset::getScalable(-SVECalleeSavedSize), TII, + // Deallocate fixed objects. + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(FixedObject), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); + + // Deallocate callee-save SVE registers. + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI); + } else if (AFI->hasSVEStackSize()) { + // If we have stack realignment or variable-sized objects we must use the FP + // to restore SVE callee saves (as there is an unknown amount of + // data/padding between the SP and SVE CS area). 
+ Register BaseForSVEDealloc = + (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP + : AArch64::SP; + if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) { + // TODO: Support stack realigment and variable-sized objects. + assert( + SVELayout != SVEStackLayout::Split && + "unexpected stack realignment or variable sized objects with split " + "SVE stack objects"); + + Register CalleeSaveBase = AArch64::FP; + if (int64_t CalleeSaveBaseOffset = + AFI->getCalleeSaveBaseToFrameRecordOffset()) { + // If we have have an non-zero offset to the non-SVE CS base we need to + // compute the base address by subtracting the offest in a temporary + // register first (to avoid briefly deallocating the SVE CS). + CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( + &AArch64::GPR64RegClass); + emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, + StackOffset::getFixed(-CalleeSaveBaseOffset), TII, MachineInstr::FrameDestroy); - } else if (BaseForSVEDealloc == AArch64::SP) { - if (SVECalleeSavedSize) { - // Deallocate the non-SVE locals first before we can deallocate (and - // restore callee saves) from the SVE area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(NumBytes), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI, EmitCFI && !HasFP, - SVEStackSize + StackOffset::getFixed( - NumBytes + PrologueSaveSize)); - NumBytes = 0; - } - - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - DeallocateBefore, TII, MachineInstr::FrameDestroy, - false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, - SVEStackSize + - StackOffset::getFixed(NumBytes + PrologueSaveSize)); - - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - DeallocateAfter, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, - DeallocateAfter + - StackOffset::getFixed(NumBytes + PrologueSaveSize)); + } + // The code below will deallocate the stack space space by moving the SP + // to the start of the SVE callee-save area. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, + -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy); + } else if (BaseForSVEDealloc == AArch64::SP) { + auto CFAOffset = + SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize); + + if (SVECalleeSavesSize) { + // Deallocate the non-SVE locals first before we can deallocate (and + // restore callee saves) from the SVE area. + auto NonSVELocals = StackOffset::getFixed(NumBytes); + emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, + NonSVELocals, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset); + CFAOffset -= NonSVELocals; + NumBytes = 0; } - if (EmitCFI) - emitCalleeSavedSVERestores(RestoreEnd); - } - } else if (AFI->hasSplitSVEObjects() && SVEStackSize) { - // TODO: Support stack realigment and variable-sized objects. - assert(!AFI->isStackRealigned() && !MFI.hasVarSizedObjects() && - "unexpected stack realignment or variable sized objects with split " - "SVE stack objects"); - // SplitSVEObjects. Determine the sizes and starts/ends of the ZPR and PPR - // areas. 
- auto ZPRCalleeSavedSize = - StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize()); - auto PPRCalleeSavedSize = - StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize()); - StackOffset PPRLocalsSize = PPRStackSize - PPRCalleeSavedSize; - StackOffset ZPRLocalsSize = ZPRStackSize - ZPRCalleeSavedSize; - - MachineBasicBlock::iterator PPRRestoreBegin = FirstGPRRestoreI, - PPRRestoreEnd = FirstGPRRestoreI; - if (PPRCalleeSavedSize) { - PPRRestoreBegin = std::prev(PPRRestoreEnd); - while (PPRRestoreBegin != MBB.begin() && - isPartOfPPRCalleeSaves(std::prev(PPRRestoreBegin))) - --PPRRestoreBegin; - } - - MachineBasicBlock::iterator ZPRRestoreBegin = PPRRestoreBegin, - ZPRRestoreEnd = PPRRestoreBegin; - if (ZPRCalleeSavedSize) { - ZPRRestoreBegin = std::prev(ZPRRestoreEnd); - while (ZPRRestoreBegin != MBB.begin() && - isPartOfZPRCalleeSaves(std::prev(ZPRRestoreBegin))) - --ZPRRestoreBegin; - } - - auto CFAOffset = - SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize); - if (PPRCalleeSavedSize || ZPRCalleeSavedSize) { - // Deallocate the non-SVE locals first before we can deallocate (and - // restore callee saves) from the SVE area. - auto NonSVELocals = StackOffset::getFixed(NumBytes); - emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP, - NonSVELocals, TII, MachineInstr::FrameDestroy, false, - false, nullptr, EmitCFI && !HasFP, CFAOffset); - NumBytes = 0; - CFAOffset -= NonSVELocals; - } + if (ZPR.LocalsSize) { + emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, + ZPR.LocalsSize, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset); + CFAOffset -= ZPR.LocalsSize; + } - if (ZPRLocalsSize) { - emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP, - ZPRLocalsSize, TII, MachineInstr::FrameDestroy, false, - false, nullptr, EmitCFI && !HasFP, CFAOffset); - CFAOffset -= ZPRLocalsSize; - } + StackOffset SVECalleeSavesToDealloc = SVECalleeSavesSize; + if (SVELayout == SVEStackLayout::Split && + (PPR.LocalsSize || ZPR.CalleeSavesSize)) { + assert(PPRRange.Begin == ZPRRange.End && + "Expected PPR restores after ZPR"); + emitFrameOffset(MBB, PPRRange.Begin, DL, AArch64::SP, AArch64::SP, + PPR.LocalsSize + ZPR.CalleeSavesSize, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI, EmitCFI && !HasFP, CFAOffset); + CFAOffset -= PPR.LocalsSize + ZPR.CalleeSavesSize; + SVECalleeSavesToDealloc -= ZPR.CalleeSavesSize; + } - if (PPRLocalsSize || ZPRCalleeSavedSize) { - assert(PPRRestoreBegin == ZPRRestoreEnd && - "Expected PPR restores after ZPR"); - emitFrameOffset(MBB, PPRRestoreBegin, DL, AArch64::SP, AArch64::SP, - PPRLocalsSize + ZPRCalleeSavedSize, TII, - MachineInstr::FrameDestroy, false, false, nullptr, - EmitCFI && !HasFP, CFAOffset); - CFAOffset -= PPRLocalsSize + ZPRCalleeSavedSize; - } - if (PPRCalleeSavedSize) { - emitFrameOffset(MBB, PPRRestoreEnd, DL, AArch64::SP, AArch64::SP, - PPRCalleeSavedSize, TII, MachineInstr::FrameDestroy, - false, false, nullptr, EmitCFI && !HasFP, CFAOffset); + // If split SVE is on, this dealloc PPRs, otherwise, deallocs ZPRs + PPRs: + if (SVECalleeSavesToDealloc) + emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP, + SVECalleeSavesToDealloc, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI, EmitCFI && !HasFP, CFAOffset); } - // We only emit CFI information for ZPRs so emit CFI after the ZPR restores. 
if (EmitCFI) - emitCalleeSavedSVERestores(ZPRRestoreEnd); + emitCalleeSavedSVERestores( + SVELayout == SVEStackLayout::Split ? ZPRRange.End : PPRRange.End); } if (!HasFP) { diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h index a1c9b34..bccadda 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h @@ -27,11 +27,23 @@ class AArch64Subtarget; class AArch64FunctionInfo; class AArch64FrameLowering; +struct SVEFrameSizes { + struct { + StackOffset CalleeSavesSize, LocalsSize; + } PPR, ZPR; +}; + class AArch64PrologueEpilogueCommon { public: AArch64PrologueEpilogueCommon(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64FrameLowering &AFL); + enum class SVEStackLayout { + Default, + Split, + CalleeSavesAboveFrameRecord, + }; + protected: bool requiresGetVGCall() const; @@ -53,6 +65,8 @@ protected: bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const; + SVEFrameSizes getSVEStackFrameSizes() const; + MachineFunction &MF; MachineBasicBlock &MBB; @@ -68,6 +82,7 @@ protected: bool IsFunclet = false; // Note: Set in derived constructors. bool NeedsWinCFI = false; // Note: Can be changed in emitFramePointerSetup. bool HomPrologEpilog = false; // Note: Set in derived constructors. + SVEStackLayout SVELayout = SVEStackLayout::Default; // Note: "HasWinCFI" is mutable as it can change in any "emit" function. mutable bool HasWinCFI = false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 0f2c335..ce2b4a5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -562,6 +562,11 @@ public: void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &); extern char &AMDGPURewriteAGPRCopyMFMALegacyID; +struct AMDGPUUniformIntrinsicCombinePass + : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 24bef82..8e35ba7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUTargetMachine.h" #include "SIModeRegisterDefaults.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -27,6 +28,7 @@ #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" @@ -106,6 +108,7 @@ public: bool FlowChanged = false; mutable Function *SqrtF32 = nullptr; mutable Function *LdexpF32 = nullptr; + mutable SmallVector<WeakVH> DeadVals; DenseMap<const PHINode *, bool> BreakPhiNodesCache; @@ -242,6 +245,8 @@ public: Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src, FastMathFlags FMF) const; + bool tryNarrowMathIfNoOverflow(Instruction *I); + public: bool visitFDiv(BinaryOperator &I); @@ -281,28 +286,21 @@ bool AMDGPUCodeGenPrepareImpl::run() { BreakPhiNodesCache.clear(); bool MadeChange = false; - Function::iterator NextBB; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) { - BasicBlock *BB = &*FI; - NextBB = std::next(FI); - - 
BasicBlock::iterator Next; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; - I = Next) { - Next = std::next(I); - - MadeChange |= visit(*I); - - if (Next != E) { // Control flow changed - BasicBlock *NextInstBB = Next->getParent(); - if (NextInstBB != BB) { - BB = NextInstBB; - E = BB->end(); - FE = F.end(); - } - } + // Need to use make_early_inc_range because integer division expansion is + // handled by Transform/Utils, and it can delete instructions such as the + // terminator of the BB. + for (BasicBlock &BB : reverse(F)) { + for (Instruction &I : make_early_inc_range(reverse(BB))) { + if (!isInstructionTriviallyDead(&I, TLI)) + MadeChange |= visit(I); } } + + while (!DeadVals.empty()) { + if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + } + return MadeChange; } @@ -422,7 +420,7 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const { Value *NewVal = insertValues(Builder, Ty, ResultVals); NewVal->takeName(&I); I.replaceAllUsesWith(NewVal); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -496,10 +494,10 @@ bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const { FoldedT, FoldedF); NewSelect->takeName(&BO); BO.replaceAllUsesWith(NewSelect); - BO.eraseFromParent(); + DeadVals.push_back(&BO); if (CastOp) - CastOp->eraseFromParent(); - Sel->eraseFromParent(); + DeadVals.push_back(CastOp); + DeadVals.push_back(Sel); return true; } @@ -895,7 +893,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { if (NewVal) { FDiv.replaceAllUsesWith(NewVal); NewVal->takeName(&FDiv); - RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLI); + DeadVals.push_back(&FDiv); } return true; @@ -1302,10 +1300,7 @@ it will create `s_and_b32 s0, s0, 0xff`. We accept this change since the non-byte load assumes the upper bits within the byte are all 0. 
*/ -static bool tryNarrowMathIfNoOverflow(Instruction *I, - const SITargetLowering *TLI, - const TargetTransformInfo &TTI, - const DataLayout &DL) { +bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) { unsigned Opc = I->getOpcode(); Type *OldType = I->getType(); @@ -1330,6 +1325,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I, NewType = I->getType()->getWithNewBitWidth(NewBit); // Old cost + const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F); InstructionCost OldCost = TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput); // New cost of new op @@ -1360,7 +1356,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I, Value *Zext = Builder.CreateZExt(Arith, OldType); I->replaceAllUsesWith(Zext); - I->eraseFromParent(); + DeadVals.push_back(I); return true; } @@ -1370,8 +1366,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { if (UseMul24Intrin && replaceMulWithMul24(I)) return true; - if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(), - TM.getTargetTransformInfo(F), DL)) + if (tryNarrowMathIfNoOverflow(&I)) return true; bool Changed = false; @@ -1436,7 +1431,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { if (NewDiv) { I.replaceAllUsesWith(NewDiv); - I.eraseFromParent(); + DeadVals.push_back(&I); Changed = true; } } @@ -1492,7 +1487,7 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) { Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy); Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType()); I.replaceAllUsesWith(ValOrig); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -1534,7 +1529,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { Fract->takeName(&I); I.replaceAllUsesWith(Fract); - RecursivelyDeleteTriviallyDeadInstructions(&I, TLI); + DeadVals.push_back(&I); return true; } @@ -1822,7 +1817,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) { } I.replaceAllUsesWith(Vec); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -1903,7 +1898,7 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) { auto *Intrin = B.CreateIntrinsic( I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)}); I.replaceAllUsesWith(Intrin); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -2000,16 +1995,10 @@ bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) { Value *Fract = applyFractPat(Builder, FractArg); Fract->takeName(&I); I.replaceAllUsesWith(Fract); - - RecursivelyDeleteTriviallyDeadInstructions(&I, TLI); + DeadVals.push_back(&I); return true; } -static bool isOneOrNegOne(const Value *Val) { - const APFloat *C; - return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0; -} - // Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way. bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { Type *Ty = Sqrt.getType()->getScalarType(); @@ -2030,18 +2019,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { if (ReqdAccuracy < 1.0f) return false; - // FIXME: This is an ugly hack for this pass using forward iteration instead - // of reverse. If it worked like a normal combiner, the rsq would form before - // we saw a sqrt call. 
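Stepping back from this hunk to the AMDGPUCodeGenPrepare changes as a whole: the pass now queues replaced instructions as WeakVH handles in DeadVals and deletes them after the block walk, which is what makes the simple reverse make_early_inc_range iteration in run() safe. A minimal structural sketch of that pattern (runDeferredCleanup and the elided visit step are hypothetical):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Transforms/Utils/Local.h"

static void runDeferredCleanup(llvm::Function &F,
                               const llvm::TargetLibraryInfo *TLI) {
  llvm::SmallVector<llvm::WeakVH> DeadVals;
  for (llvm::BasicBlock &BB : llvm::reverse(F))
    for (llvm::Instruction &I : llvm::make_early_inc_range(llvm::reverse(BB)))
      if (!llvm::isInstructionTriviallyDead(&I, TLI)) {
        // visit(I) would go here; on success it calls I.replaceAllUsesWith()
        // and then DeadVals.push_back(&I) instead of I.eraseFromParent().
      }
  // Drain after the walk; a WeakVH nulls itself if its instruction was
  // already removed as a side effect of an earlier recursive deletion.
  while (!DeadVals.empty())
    if (auto *I = llvm::dyn_cast_or_null<llvm::Instruction>(
            DeadVals.pop_back_val()))
      llvm::RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
}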
- auto *FDiv = - dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser()); - if (FDiv && FDiv->getOpcode() == Instruction::FDiv && - FDiv->getFPAccuracy() >= 1.0f && - canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) && - // TODO: We should also handle the arcp case for the fdiv with non-1 value - isOneOrNegOne(FDiv->getOperand(0))) - return false; - Value *SrcVal = Sqrt.getOperand(0); bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt); @@ -2065,7 +2042,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals); NewSqrt->takeName(&Sqrt); Sqrt.replaceAllUsesWith(NewSqrt); - Sqrt.eraseFromParent(); + DeadVals.push_back(&Sqrt); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index e4d328a..b8b419d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1112,8 +1112,7 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { {N->getOperand(0), N->getOperand(1), CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/}); } else { - unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO - : AMDGPU::S_USUBO_PSEUDO; + unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO; CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {N->getOperand(0), N->getOperand(1)}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 9449e70..a6074ea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -30,6 +30,7 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass( MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) +MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 73b2660..5407566 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -468,6 +468,38 @@ void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) { MI.eraseFromParent(); } +void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) { + Register Lo, Hi; + switch (MI.getOpcode()) { + case AMDGPU::G_SMIN: + case AMDGPU::G_SMAX: { + // For signed operations, use sign extension + auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg()); + auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg()); + Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo}) + .getReg(0); + Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi}) + .getReg(0); + break; + } + case AMDGPU::G_UMIN: + case AMDGPU::G_UMAX: { + // For unsigned operations, use zero extension + auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg()); + auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg()); + Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo}) + .getReg(0); + Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi}) + .getReg(0); + break; + } + default: + llvm_unreachable("Unpack min/max lowering not implemented"); + 
} + B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi}); + MI.eraseFromParent(); +} + static bool isSignedBFE(MachineInstr &MI) { if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI)) return (GI->is(Intrinsic::amdgcn_sbfe)); @@ -654,6 +686,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, } case UnpackBitShift: return lowerUnpackBitShift(MI); + case UnpackMinMax: + return lowerUnpackMinMax(MI); case Ext32To64: { const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); MachineInstrBuilder Hi; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index 7affe5a..d937815 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -123,6 +123,7 @@ private: void lowerSplitTo32(MachineInstr &MI); void lowerSplitTo32Select(MachineInstr &MI); void lowerSplitTo32SExtInReg(MachineInstr &MI); + void lowerUnpackMinMax(MachineInstr &MI); }; } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index f413bbc..7392f4b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -522,6 +522,22 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32, Sgpr32}, S_BFE}) .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32, Vgpr32}, V_BFE}); + addRulesForGOpcs({G_SMIN, G_SMAX}, Standard) + .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32SExt}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackMinMax}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}); + + addRulesForGOpcs({G_UMIN, G_UMAX}, Standard) + .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackMinMax}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}); + // Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT // and G_FREEZE here, rest is trivially regbankselected earlier addRulesForGOpcs({G_IMPLICIT_DEF}).Any({{UniS1}, {{Sgpr32Trunc}, {}}}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index d0c6910..93e0efd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -212,6 +212,7 @@ enum LoweringMethodID { VccExtToSel, UniExtToSel, UnpackBitShift, + UnpackMinMax, S_BFE, V_BFE, VgprToVccCopy, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 557d87f..56807a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5053,16 +5053,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // // vdst, srcA, srcB, srcC const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + bool UseAGPRForm = !Subtarget.hasGFX90AInsts() || + Info->selectAGPRFormMFMA(MinNumRegsRequired); + OpdsMapping[0] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? 
getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4: @@ -5115,11 +5117,21 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8: case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8: case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: { + Register DstReg = MI.getOperand(0).getReg(); + unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); + unsigned MinNumRegsRequired = DstSize / 32; + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired); + // vdst, srcA, srcB, srcC, idx - OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[0] = UseAGPRForm ? getAGPROpMapping(DstReg, MRI, *TRI) + : getVGPROpMapping(DstReg, MRI, *TRI); + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); - OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[4] = + UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c7a91f4c..4958a20 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -526,6 +526,11 @@ static cl::opt<bool> HasClosedWorldAssumption( cl::desc("Whether has closed-world assumption at link time"), cl::init(false), cl::Hidden); +static cl::opt<bool> EnableUniformIntrinsicCombine( + "amdgpu-enable-uniform-intrinsic-combine", + cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"), + cl::init(true), cl::Hidden); + extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(getTheR600Target()); @@ -879,6 +884,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (EarlyInlineAll && !EnableFunctionCalls) PM.addPass(AMDGPUAlwaysInlinePass()); + + if (EnableUniformIntrinsicCombine) + PM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerPeepholeEPCallback( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp new file mode 100644 index 0000000..50c78d8 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -0,0 +1,159 @@ +//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass simplifies certain intrinsic calls when the arguments are uniform. +/// It's true that this pass has transforms that can lead to a situation where +/// some instruction whose operand was previously recognized as statically +/// uniform is later on no longer recognized as statically uniform. However, the +/// semantics of how programs execute don't (and must not, for this precise +/// reason) care about static uniformity, they only ever care about dynamic +/// uniformity. And every instruction that's downstream and cares about dynamic +/// uniformity must be convergent (and isel will introduce v_readfirstlane for +/// them if their operands can't be proven statically uniform). +/// +/// This pass is implemented as a ModulePass because intrinsic declarations +/// exist at the module scope, allowing us to skip processing entirely if no +/// declarations are present and to traverse their user lists directly when +/// they are. A FunctionPass would instead require scanning every instruction +/// in every function to find relevant intrinsics, which is far less efficient. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/UniformityAnalysis.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine" + +using namespace llvm; +using namespace llvm::AMDGPU; +using namespace llvm::PatternMatch; + +/// Wrapper for querying uniformity info that first checks locally tracked +/// instructions. +static bool +isDivergentUseWithNew(const Use &U, const UniformityInfo &UI, + const ValueMap<const Value *, bool> &Tracker) { + Value *V = U.get(); + if (auto It = Tracker.find(V); It != Tracker.end()) + return !It->second; // divergent if marked false + return UI.isDivergentUse(U); +} + +/// Optimizes uniform intrinsics calls if their operand can be proven uniform. 
+static bool optimizeUniformIntrinsic(IntrinsicInst &II, + const UniformityInfo &UI, + ValueMap<const Value *, bool> &Tracker) { + llvm::Intrinsic::ID IID = II.getIntrinsicID(); + + switch (IID) { + case Intrinsic::amdgcn_permlane64: + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: { + Value *Src = II.getArgOperand(0); + if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker)) + return false; + LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n'); + II.replaceAllUsesWith(Src); + II.eraseFromParent(); + return true; + } + case Intrinsic::amdgcn_ballot: { + Value *Src = II.getArgOperand(0); + if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker)) + return false; + LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n'); + + bool Changed = false; + for (User *U : make_early_inc_range(II.users())) { + if (auto *ICmp = dyn_cast<ICmpInst>(U)) { + Value *Op0 = ICmp->getOperand(0); + Value *Op1 = ICmp->getOperand(1); + ICmpInst::Predicate Pred = ICmp->getPredicate(); + Value *OtherOp = Op0 == &II ? Op1 : Op0; + + if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) { + // Case: (icmp eq %ballot, 0) -> xor %ballot_arg, 1 + Instruction *NotOp = + BinaryOperator::CreateNot(Src, "", ICmp->getIterator()); + Tracker[NotOp] = true; // NOT preserves uniformity + LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n'); + ICmp->replaceAllUsesWith(NotOp); + ICmp->eraseFromParent(); + Changed = true; + } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) { + // Case: (icmp ne %ballot, 0) -> %ballot_arg + LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: " + << *Src << '\n'); + ICmp->replaceAllUsesWith(Src); + ICmp->eraseFromParent(); + Changed = true; + } + } + } + // Erase the intrinsic if it has no remaining uses. + if (II.use_empty()) + II.eraseFromParent(); + return Changed; + } + default: + llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic"); + } + return false; +} + +/// Iterates over intrinsic declarations in the module to optimize their uses. 
+static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { + bool IsChanged = false; + ValueMap<const Value *, bool> Tracker; + + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + for (Function &F : M) { + switch (F.getIntrinsicID()) { + case Intrinsic::amdgcn_permlane64: + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_ballot: + break; + default: + continue; + } + + for (User *U : make_early_inc_range(F.users())) { + auto *II = cast<IntrinsicInst>(U); + Function *ParentF = II->getFunction(); + const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF); + IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); + } + } + return IsChanged; +} + +PreservedAnalyses +AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) { + if (!runUniformIntrinsicCombine(M, AM)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve<UniformityInfoAnalysis>(); + return PA; +} diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index aae56ee..13f727b68 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -64,6 +64,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUHSAMetadataStreamer.cpp AMDGPUInsertDelayAlu.cpp AMDGPUInstCombineIntrinsic.cpp + AMDGPUUniformIntrinsicCombine.cpp AMDGPUInstrInfo.cpp AMDGPUInstructionSelector.cpp AMDGPUISelDAGToDAG.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index ef63acc..71494be 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -905,7 +905,7 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { OS << ":\n"; SlotIndex MBBStartSlot = LIS.getSlotIndexes()->getMBBStartIdx(&MBB); - SlotIndex MBBEndSlot = LIS.getSlotIndexes()->getMBBEndIdx(&MBB); + SlotIndex MBBLastSlot = LIS.getSlotIndexes()->getMBBLastIdx(&MBB); GCNRPTracker::LiveRegSet LiveIn, LiveOut; GCNRegPressure RPAtMBBEnd; @@ -931,7 +931,7 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { } } else { GCNUpwardRPTracker RPT(LIS); - RPT.reset(MRI, MBBEndSlot); + RPT.reset(MRI, MBBLastSlot); LiveOut = RPT.getLiveRegs(); RPAtMBBEnd = RPT.getPressure(); @@ -966,14 +966,14 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { OS << PFX " Live-out:" << llvm::print(LiveOut, MRI); if (UseDownwardTracker) - ReportLISMismatchIfAny(LiveOut, getLiveRegs(MBBEndSlot, LIS, MRI)); + ReportLISMismatchIfAny(LiveOut, getLiveRegs(MBBLastSlot, LIS, MRI)); GCNRPTracker::LiveRegSet LiveThrough; for (auto [Reg, Mask] : LiveIn) { LaneBitmask MaskIntersection = Mask & LiveOut.lookup(Reg); if (MaskIntersection.any()) { LaneBitmask LTMask = getRegLiveThroughMask( - MRI, LIS, Reg, MBBStartSlot, MBBEndSlot, MaskIntersection); + MRI, LIS, Reg, MBBStartSlot, MBBLastSlot, MaskIntersection); if (LTMask.any()) LiveThrough[Reg] = LTMask; } diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index a9c58bb..898d1ff 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -313,8 +313,8 @@ public: /// reset tracker to the end of the \p MBB. 
void reset(const MachineBasicBlock &MBB) { - reset(MBB.getParent()->getRegInfo(), - LIS.getSlotIndexes()->getMBBEndIdx(&MBB)); + SlotIndex MBBLastSlot = LIS.getSlotIndexes()->getMBBLastIdx(&MBB); + reset(MBB.getParent()->getRegInfo(), MBBLastSlot); } /// reset tracker to the point just after \p MI (in program order). diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1a686a9..80e985d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -103,52 +103,52 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::Untyped, V64RegClass); addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); - addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96)); + addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass); addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); - addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128)); + addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); - addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160)); + addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass); addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192)); + addRegisterClass(MVT::v6f32, &AMDGPU::VReg_192RegClass); addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192)); + addRegisterClass(MVT::v3f64, &AMDGPU::VReg_192RegClass); addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass); - addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224)); + addRegisterClass(MVT::v7f32, &AMDGPU::VReg_224RegClass); addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256)); + addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256)); + addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass); addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass); - addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288)); + addRegisterClass(MVT::v9f32, &AMDGPU::VReg_288RegClass); addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass); - addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320)); + addRegisterClass(MVT::v10f32, &AMDGPU::VReg_320RegClass); addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass); - addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352)); + addRegisterClass(MVT::v11f32, &AMDGPU::VReg_352RegClass); addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass); - addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384)); + addRegisterClass(MVT::v12f32, &AMDGPU::VReg_384RegClass); addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512)); + addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512)); + addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass); addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); - addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024)); + addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass); if 
(Subtarget->has16BitInsts()) { if (Subtarget->useRealTrue16Insts()) { @@ -180,7 +180,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); - addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024)); + addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -6073,9 +6073,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineOperand &Src0 = MI.getOperand(2); MachineOperand &Src1 = MI.getOperand(3); MachineOperand &Src2 = MI.getOperand(4); - unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) - ? AMDGPU::S_ADDC_U32 - : AMDGPU::S_SUBB_U32; if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) { Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0) @@ -6124,11 +6121,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addImm(0); } - // clang-format off - BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()) - .add(Src0) - .add(Src1); - // clang-format on + unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO + ? AMDGPU::S_ADDC_U32 + : AMDGPU::S_SUBB_U32; + + BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1); unsigned SelOpc = ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; @@ -16571,6 +16568,53 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, } } + // Eliminate setcc by using carryout from add/sub instruction + + // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo + // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi + // similarly for subtraction + + // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1 + // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0 + + if (VT == MVT::i64 && ((CC == ISD::SETULT && + sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) || + (CC == ISD::SETUGT && + sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) || + (CC == ISD::SETEQ && CRHS && CRHS->isZero() && + sd_match(LHS, m_Add(m_Value(), m_One()))))) { + bool IsAdd = LHS.getOpcode() == ISD::ADD; + + SDValue Op0 = LHS.getOperand(0); + SDValue Op1 = LHS.getOperand(1); + + SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0); + SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1); + + SDValue Op0Hi = getHiHalf64(Op0, DAG); + SDValue Op1Hi = getHiHalf64(Op1, DAG); + + SDValue NodeLo = + DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL, + DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo}); + + SDValue CarryInHi = NodeLo.getValue(1); + SDValue NodeHi = DAG.getNode(IsAdd ? 
ISD::UADDO_CARRY : ISD::USUBO_CARRY, + SL, DAG.getVTList(MVT::i32, MVT::i1), + {Op0Hi, Op1Hi, CarryInHi}); + + SDValue ResultLo = NodeLo.getValue(0); + SDValue ResultHi = NodeHi.getValue(0); + + SDValue JoinedResult = + DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi}); + + SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult); + SDValue Overflow = NodeHi.getValue(1); + DCI.CombineTo(LHS.getNode(), Result); + return Overflow; + } + if (VT != MVT::f32 && VT != MVT::f64 && (!Subtarget->has16BitInsts() || VT != MVT::f16)) return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index b7dbb59..2c1a13c 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -1202,6 +1202,12 @@ public: unsigned getMinNumAGPRs() const { return MinNumAGPRs; } + /// Return true if an MFMA that requires at least \p NumRegs should select to + /// the AGPR form, instead of the VGPR form. + bool selectAGPRFormMFMA(unsigned NumRegs) const { + return !MFMAVGPRForm && getMinNumAGPRs() >= NumRegs; + } + // \returns true if a function has a use of AGPRs via inline asm or // has a call which may use it. bool mayUseAGPRs(const Function &F) const; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 7cfd059..6500fce 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -964,14 +964,12 @@ class MAIFrag<SDPatternOperator Op, bit HasAbid = true, bit Scaled = false> : Pa class CanUseAGPR_MAI<ValueType vt> { code PredicateCode = [{ return !Subtarget->hasGFX90AInsts() || - (!SIMachineFunctionInfo::MFMAVGPRForm && - MF->getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >= - }] # !srl(vt.Size, 5) # ");"; + MF->getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA( + }] # !srl(vt.Size, 5) # ");"; code GISelPredicateCode = [{ return !Subtarget->hasGFX90AInsts() || - (!SIMachineFunctionInfo::MFMAVGPRForm && - MF.getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >= + MF.getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA( }] # !srl(vt.Size, 5) # ");"; } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 2a40fb9..83c7def 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -42,7 +42,6 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/ComplexDeinterleavingPass.h" #include "llvm/CodeGen/ISDOpcodes.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td index 6d0529f..fb0928b8 100644 --- a/llvm/lib/Target/Hexagon/Hexagon.td +++ b/llvm/lib/Target/Hexagon/Hexagon.td @@ -110,8 +110,6 @@ def FeatureSmallData: SubtargetFeature<"small-data", "UseSmallData", "true", "Allow GP-relative addressing of global variables">; def FeatureDuplex: SubtargetFeature<"duplex", "EnableDuplex", "true", "Enable generation of duplex instruction">; -def FeatureUnsafeFP: SubtargetFeature<"unsafe-fp", "UseUnsafeMath", "true", - "Use unsafe FP math">; def FeatureReservedR19: SubtargetFeature<"reserved-r19", "ReservedR19", "true", "Reserve register R19">; def FeatureNoreturnStackElim: SubtargetFeature<"noreturn-stack-elim", @@ -167,7 +165,6 @@ def UseHVXQFloat : 
Predicate<"HST->useHVXQFloatOps()">, def UseHVXFloatingPoint: Predicate<"HST->useHVXFloatingPoint()">; def HasMemNoShuf : Predicate<"HST->hasMemNoShuf()">, AssemblerPredicate<(all_of FeatureMemNoShuf)>; -def UseUnsafeMath : Predicate<"HST->useUnsafeMath()">; def NotOptTinyCore : Predicate<"!HST->isTinyCore() ||" "MF->getFunction().hasOptSize()"> { let RecomputePerFunction = 1; diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index 4b23670..85ce944 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -699,35 +699,20 @@ def: OpR_RR_pat<C2_cmpgtp, setgt, i1, I64>; def: OpR_RR_pat<C2_cmpgtup, setugt, i1, I64>; def: OpR_RR_pat<C2_cmpgtp, RevCmp<setlt>, i1, I64>; def: OpR_RR_pat<C2_cmpgtup, RevCmp<setult>, i1, I64>; -def: OpR_RR_pat<A2_vcmpbeq, seteq, i1, V8I8>; def: OpR_RR_pat<A2_vcmpbeq, seteq, v8i1, V8I8>; -def: OpR_RR_pat<A4_vcmpbgt, RevCmp<setlt>, i1, V8I8>; def: OpR_RR_pat<A4_vcmpbgt, RevCmp<setlt>, v8i1, V8I8>; -def: OpR_RR_pat<A4_vcmpbgt, setgt, i1, V8I8>; def: OpR_RR_pat<A4_vcmpbgt, setgt, v8i1, V8I8>; -def: OpR_RR_pat<A2_vcmpbgtu, RevCmp<setult>, i1, V8I8>; def: OpR_RR_pat<A2_vcmpbgtu, RevCmp<setult>, v8i1, V8I8>; -def: OpR_RR_pat<A2_vcmpbgtu, setugt, i1, V8I8>; def: OpR_RR_pat<A2_vcmpbgtu, setugt, v8i1, V8I8>; -def: OpR_RR_pat<A2_vcmpheq, seteq, i1, V4I16>; def: OpR_RR_pat<A2_vcmpheq, seteq, v4i1, V4I16>; -def: OpR_RR_pat<A2_vcmphgt, RevCmp<setlt>, i1, V4I16>; def: OpR_RR_pat<A2_vcmphgt, RevCmp<setlt>, v4i1, V4I16>; -def: OpR_RR_pat<A2_vcmphgt, setgt, i1, V4I16>; def: OpR_RR_pat<A2_vcmphgt, setgt, v4i1, V4I16>; -def: OpR_RR_pat<A2_vcmphgtu, RevCmp<setult>, i1, V4I16>; def: OpR_RR_pat<A2_vcmphgtu, RevCmp<setult>, v4i1, V4I16>; -def: OpR_RR_pat<A2_vcmphgtu, setugt, i1, V4I16>; def: OpR_RR_pat<A2_vcmphgtu, setugt, v4i1, V4I16>; -def: OpR_RR_pat<A2_vcmpweq, seteq, i1, V2I32>; def: OpR_RR_pat<A2_vcmpweq, seteq, v2i1, V2I32>; -def: OpR_RR_pat<A2_vcmpwgt, RevCmp<setlt>, i1, V2I32>; def: OpR_RR_pat<A2_vcmpwgt, RevCmp<setlt>, v2i1, V2I32>; -def: OpR_RR_pat<A2_vcmpwgt, setgt, i1, V2I32>; def: OpR_RR_pat<A2_vcmpwgt, setgt, v2i1, V2I32>; -def: OpR_RR_pat<A2_vcmpwgtu, RevCmp<setult>, i1, V2I32>; def: OpR_RR_pat<A2_vcmpwgtu, RevCmp<setult>, v2i1, V2I32>; -def: OpR_RR_pat<A2_vcmpwgtu, setugt, i1, V2I32>; def: OpR_RR_pat<A2_vcmpwgtu, setugt, v2i1, V2I32>; def: OpR_RR_pat<F2_sfcmpeq, seteq, i1, F32>; @@ -1213,12 +1198,6 @@ def: OpR_RI_pat<S2_asl_i_r, Shl, i32, I32, u5_0ImmPred>; def: OpR_RI_pat<S2_asr_i_p, Sra, i64, I64, u6_0ImmPred>; def: OpR_RI_pat<S2_lsr_i_p, Srl, i64, I64, u6_0ImmPred>; def: OpR_RI_pat<S2_asl_i_p, Shl, i64, I64, u6_0ImmPred>; -def: OpR_RI_pat<S2_asr_i_vh, Sra, v4i16, V4I16, u4_0ImmPred>; -def: OpR_RI_pat<S2_lsr_i_vh, Srl, v4i16, V4I16, u4_0ImmPred>; -def: OpR_RI_pat<S2_asl_i_vh, Shl, v4i16, V4I16, u4_0ImmPred>; -def: OpR_RI_pat<S2_asr_i_vh, Sra, v2i32, V2I32, u5_0ImmPred>; -def: OpR_RI_pat<S2_lsr_i_vh, Srl, v2i32, V2I32, u5_0ImmPred>; -def: OpR_RI_pat<S2_asl_i_vh, Shl, v2i32, V2I32, u5_0ImmPred>; def: OpR_RR_pat<S2_asr_r_r, Sra, i32, I32, I32>; def: OpR_RR_pat<S2_lsr_r_r, Srl, i32, I32, I32>; @@ -1611,8 +1590,11 @@ def DfMpy: OutPatFrag<(ops node:$Rs, node:$Rt), $Rt, $Rs), $Rs, $Rt)>; -let Predicates = [HasV67,UseUnsafeMath], AddedComplexity = 50 in { - def: Pat<(fmul F64:$Rs, F64:$Rt), (DfMpy $Rs, $Rt)>; +def fmul_afn : PatFrag<(ops node:$a, node:$b), (fmul node:$a, node:$b), [{ + return N->getFlags().hasApproximateFuncs(); +}]>; +let Predicates = [HasV67], AddedComplexity = 
50 in { + def : Pat<(fmul_afn F64:$Rs, F64:$Rt), (DfMpy $Rs, $Rt)>; } let Predicates = [HasV67] in { def: OpR_RR_pat<F2_dfmin, pf2<fminimumnum>, f64, F64>; diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index b111471..7430567 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -54,7 +54,6 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { bool UseNewValueJumps = false; bool UseNewValueStores = false; bool UseSmallData = false; - bool UseUnsafeMath = false; bool UseZRegOps = false; bool UseHVXIEEEFPOps = false; bool UseHVXQFloatOps = false; @@ -234,7 +233,6 @@ public: bool useNewValueJumps() const { return UseNewValueJumps; } bool useNewValueStores() const { return UseNewValueStores; } bool useSmallData() const { return UseSmallData; } - bool useUnsafeMath() const { return UseUnsafeMath; } bool useZRegOps() const { return UseZRegOps; } bool useCabac() const { return UseCabac; } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 0afa04a..f5d8b69 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -250,13 +250,6 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const { CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU; std::string FS = FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS; - // Append the preexisting target features last, so that +mattr overrides - // the "unsafe-fp-math" function attribute. - // Creating a separate target feature is not strictly necessary, it only - // exists to make "unsafe-fp-math" force creating a new subtarget. - - if (F.getFnAttribute("unsafe-fp-math").getValueAsBool()) - FS = FS.empty() ? 
"+unsafe-fp" : "+unsafe-fp," + FS; auto &I = SubtargetMap[CPU + FS]; if (!I) { diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index ba70c9e..97379d7 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -3677,7 +3677,7 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, Out, STI)) return true; - if (IsLikely) { + if (IsLikely && MemOffsetOp.isExpr()) { TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg, MCOperand::createExpr(MemOffsetOp.getExpr()), IDLoc, STI); TOut.emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI); diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.td b/llvm/lib/Target/Mips/MipsInstrInfo.td index eff80e5..21d8ded 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.td +++ b/llvm/lib/Target/Mips/MipsInstrInfo.td @@ -855,6 +855,16 @@ def calltarget : Operand<iPTR> { def imm64: Operand<i64>; +def ConstantImmAsmOperandClass : AsmOperandClass { + let Name = "ConstantImm"; + let PredicateMethod = "isConstantImm"; + let RenderMethod = "addImmOperands"; +} + +def ConstantImm64: Operand<i64> { + let ParserMatchClass = ConstantImmAsmOperandClass; +} + def simm19_lsl2 : Operand<i32> { let EncoderMethod = "getSimm19Lsl2Encoding"; let DecoderMethod = "DecodeSimm19Lsl2"; @@ -2947,10 +2957,10 @@ def : MipsInstAlias<"nor\t$rs, $imm", (NORImm GPR32Opnd:$rs, GPR32Opnd:$rs, let hasDelaySlot = 1, isCTI = 1 in { def BneImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), - (ins imm64:$imm64, brtarget:$offset), + (ins ConstantImm64:$imm64, brtarget:$offset), "bne\t$rt, $imm64, $offset">; def BeqImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), - (ins imm64:$imm64, brtarget:$offset), + (ins ConstantImm64:$imm64, brtarget:$offset), "beq\t$rt, $imm64, $offset">; class CondBranchPseudo<string instr_asm> : @@ -2978,7 +2988,7 @@ def BGTUL: CondBranchPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6; let isCTI = 1 in class CondBranchImmPseudo<string instr_asm> : - MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, imm64:$imm, brtarget:$offset), + MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, ConstantImm64:$imm, brtarget:$offset), !strconcat(instr_asm, "\t$rs, $imm, $offset")>; def BEQLImmMacro : CondBranchImmPseudo<"beql">, ISA_MIPS2_NOT_32R6_64R6; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index bc047a4a..a1fb665 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -651,7 +651,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // Custom conversions to/from v2i8. setOperationAction(ISD::BITCAST, MVT::v2i8, Custom); - // Only logical ops can be done on v4i8 directly, others must be done + // Only logical ops can be done on v4i8/v2i32 directly, others must be done // elementwise. setOperationAction( {ISD::ABS, ISD::ADD, ISD::ADDC, ISD::ADDE, @@ -669,7 +669,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, ISD::UMIN, ISD::UMULO, ISD::UMUL_LOHI, ISD::UREM, ISD::USHLSAT, ISD::USUBO, ISD::USUBO_CARRY, ISD::VSELECT, ISD::USUBSAT}, - MVT::v4i8, Expand); + {MVT::v4i8, MVT::v2i32}, Expand); // Operations not directly supported by NVPTX. 
for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, @@ -689,7 +689,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i16, MVT::v2i32}, Expand); setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index ecfb5fe..eb41588 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -334,7 +334,7 @@ static bool isLegalElementTypeForRVV(Type *EltTy, if (EltTy->isIntegerTy(64)) return Subtarget.hasVInstructionsI64(); if (EltTy->isHalfTy()) - return Subtarget.hasVInstructionsF16(); + return Subtarget.hasVInstructionsF16Minimal(); if (EltTy->isBFloatTy()) return Subtarget.hasVInstructionsBF16Minimal(); if (EltTy->isFloatTy()) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 40c05e8..5ceb477 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1520,6 +1520,8 @@ def HasVendorXqcics : Predicate<"Subtarget->hasVendorXqcics()">, AssemblerPredicate<(all_of FeatureVendorXqcics), "'Xqcics' (Qualcomm uC Conditional Select Extension)">; +def NoVendorXqcics + : Predicate<"!Subtarget->hasVendorXqcics()">; def FeatureVendorXqcicsr : RISCVExperimentalExtension<0, 4, "Qualcomm uC CSR Extension">; @@ -1823,6 +1825,11 @@ def TuneConditionalCompressedMoveFusion def HasConditionalMoveFusion : Predicate<"Subtarget->hasConditionalMoveFusion()">; def NoConditionalMoveFusion : Predicate<"!Subtarget->hasConditionalMoveFusion()">; +def TuneHasSingleElementVecFP64 + : SubtargetFeature<"single-element-vec-fp64", "HasSingleElementVectorFP64", "true", + "Certain vector FP64 operations produce a single result " + "element per cycle">; + def TuneMIPSP8700 : SubtargetFeature<"mips-p8700", "RISCVProcFamily", "MIPSP8700", "MIPS p8700 processor">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 447f05c..5e1d07a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -1571,35 +1571,42 @@ def : QCIMVCCIPat<SETUGE, QC_MVGEUI, uimm5nonzero>; } let Predicates = [HasVendorXqcicli, IsRV32] in { -def : QCILICCPat<SETEQ, QC_LIEQ>; -def : QCILICCPat<SETNE, QC_LINE>; def : QCILICCPat<SETLT, QC_LILT>; def : QCILICCPat<SETGE, QC_LIGE>; def : QCILICCPat<SETULT, QC_LILTU>; def : QCILICCPat<SETUGE, QC_LIGEU>; -def : QCILICCIPat<SETEQ, QC_LIEQI, simm5>; -def : QCILICCIPat<SETNE, QC_LINEI, simm5>; def : QCILICCIPat<SETLT, QC_LILTI, simm5>; def : QCILICCIPat<SETGE, QC_LIGEI, simm5>; def : QCILICCIPat<SETULT, QC_LILTUI, uimm5>; def : QCILICCIPat<SETUGE, QC_LIGEUI, uimm5>; -def : QCILICCPatInv<SETNE, QC_LIEQ>; -def : QCILICCPatInv<SETEQ, QC_LINE>; def : QCILICCPatInv<SETGE, QC_LILT>; def : QCILICCPatInv<SETLT, QC_LIGE>; def : QCILICCPatInv<SETUGE, QC_LILTU>; def : QCILICCPatInv<SETULT, QC_LIGEU>; -def : QCILICCIPatInv<SETNE, QC_LIEQI, simm5>; -def : QCILICCIPatInv<SETEQ, QC_LINEI, simm5>; def : QCILICCIPatInv<SETGE, QC_LILTI, simm5>; def : QCILICCIPatInv<SETLT, QC_LIGEI, 
simm5>; def : QCILICCIPatInv<SETUGE, QC_LILTUI, uimm5>; def : QCILICCIPatInv<SETULT, QC_LIGEUI, uimm5>; } // Predicates = [HasVendorXqcicli, IsRV32] +// Prioritize Xqcics over these patterns. +let Predicates = [HasVendorXqcicli, NoVendorXqcics, IsRV32] in { +def : QCILICCPat<SETEQ, QC_LIEQ>; +def : QCILICCPat<SETNE, QC_LINE>; + +def : QCILICCIPat<SETEQ, QC_LIEQI, simm5>; +def : QCILICCIPat<SETNE, QC_LINEI, simm5>; + +def : QCILICCPatInv<SETNE, QC_LIEQ>; +def : QCILICCPatInv<SETEQ, QC_LINE>; + +def : QCILICCIPatInv<SETNE, QC_LIEQI, simm5>; +def : QCILICCIPatInv<SETEQ, QC_LINEI, simm5>; +} // Predicates = [HasVendorXqcicli, NoVendorXqcics, IsRV32] + let Predicates = [HasVendorXqcics, IsRV32] in { // (SELECT X, Y, Z) is canonicalised to `(riscv_selectcc x, 0, NE, y, z)`. // These exist to prioritise over the `Select_GPR_Using_CC_GPR` pattern. @@ -1636,7 +1643,7 @@ def : QCISELECTCCIPat<SETNE, QC_SELECTNEI>; } let Predicates = [HasVendorXqcilsm, IsRV32] in { -def : Pat<(qc_setwmi GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7), +def : Pat<(qc_setwmi (i32 GPR:$rs3), GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7), (QC_SETWMI GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7)>; } // Predicates = [HasVendorXqcilsm, IsRV32] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index a29b7dd..57fbaa0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -634,56 +634,56 @@ def : PatGpr<bswap, REV8_RV64, i64>; let Predicates = [HasStdExtZbkb] in { def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFFFF), - (zexti8 (XLenVT GPR:$rs1))), - (PACKH GPR:$rs1, GPR:$rs2)>; -def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 8)), - (zexti8 (XLenVT GPR:$rs1))), - (PACKH GPR:$rs1, GPR:$rs2)>; + zexti8:$rs1), + (PACKH zexti8:$rs1, GPR:$rs2)>; +def : Pat<(or (shl zexti8:$rs2, (XLenVT 8)), + zexti8:$rs1), + (PACKH zexti8:$rs1, zexti8:$rs2)>; def : Pat<(and (or (shl GPR:$rs2, (XLenVT 8)), - (zexti8 (XLenVT GPR:$rs1))), 0xFFFF), - (PACKH GPR:$rs1, GPR:$rs2)>; + zexti8:$rs1), 0xFFFF), + (PACKH zexti8:$rs1, GPR:$rs2)>; def : Pat<(binop_allhusers<or> (shl GPR:$rs2, (XLenVT 8)), - (zexti8 (XLenVT GPR:$rs1))), - (PACKH GPR:$rs1, GPR:$rs2)>; + zexti8:$rs1), + (PACKH zexti8:$rs1, GPR:$rs2)>; } // Predicates = [HasStdExtZbkb] let Predicates = [HasStdExtZbkb, IsRV32] in { -def : Pat<(i32 (or (zexti16 (i32 GPR:$rs1)), (shl GPR:$rs2, (i32 16)))), - (PACK GPR:$rs1, GPR:$rs2)>; +def : Pat<(i32 (or zexti16:$rs1, (shl GPR:$rs2, (i32 16)))), + (PACK zexti16:$rs1, GPR:$rs2)>; -def : Pat<(or (shl GPR:$rs2, (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), - (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; +def : Pat<(i32 (or (shl GPR:$rs2, (XLenVT 24)), + (shl zexti8:$rs1, (XLenVT 16)))), + (SLLI (XLenVT (PACKH zexti8:$rs1, GPR:$rs2)), (XLenVT 16))>; // Match a pattern of 2 bytes being inserted into bits [31:16], with bits // bits [15:0] coming from a zero extended value. We can use pack with packh for // bits [31:16]. If bits [15:0] can also be a packh, it can be matched // separately. 
-def : Pat<(or (or (shl GPR:$op1rs2, (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))), - (zexti16 (XLenVT GPR:$rs1))), - (PACK (XLenVT GPR:$rs1), - (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; +def : Pat<(i32 (or (or (shl GPR:$op1rs2, (XLenVT 24)), + (shl zexti8:$op1rs1, (XLenVT 16))), + zexti16:$rs1)), + (PACK zexti16:$rs1, + (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>; } let Predicates = [HasStdExtZbkb, IsRV64] in { -def : Pat<(i64 (or (zexti32 (i64 GPR:$rs1)), (shl GPR:$rs2, (i64 32)))), - (PACK GPR:$rs1, GPR:$rs2)>; +def : Pat<(i64 (or zexti32:$rs1, (shl GPR:$rs2, (i64 32)))), + (PACK zexti32:$rs1, GPR:$rs2)>; -def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), - (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; +def : Pat<(i64 (or (shl zexti8:$rs2, (XLenVT 24)), + (shl zexti8:$rs1, (XLenVT 16)))), + (SLLI (XLenVT (PACKH zexti8:$rs1, zexti8:$rs2)), (XLenVT 16))>; def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), - (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; + (shl zexti8:$rs1, (XLenVT 16))), + (SLLI (XLenVT (PACKH zexti8:$rs1, GPR:$rs2)), (XLenVT 16))>; def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (i64 16)), - (zexti16 (i64 GPR:$rs1))), - (PACKW GPR:$rs1, GPR:$rs2)>; + zexti16:$rs1), + (PACKW zexti16:$rs1, GPR:$rs2)>; def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32), - (zexti16 (i64 GPR:$rs1)))), - (PACKW GPR:$rs1, GPR:$rs2)>; + zexti16:$rs1)), + (PACKW zexti16:$rs1, GPR:$rs2)>; // Match a pattern of 2 bytes being inserted into bits [31:16], with bits // bits [15:0] coming from a zero extended value, and bits [63:32] being @@ -691,35 +691,35 @@ def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32), // also be a packh, it can be matched separately. def : Pat<(binop_allwusers<or> (or (shl GPR:$op1rs2, (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))), - (zexti16 (XLenVT GPR:$rs1))), - (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; + (shl zexti8:$op1rs1, (XLenVT 16))), + zexti16:$rs1), + (PACKW zexti16:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>; // We need to manually reassociate the patterns because of the binop_allwusers. def : Pat<(binop_allwusers<or> - (or (zexti16 (XLenVT GPR:$rs1)), - (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))), + (or zexti16:$rs1, + (shl zexti8:$op1rs1, (XLenVT 16))), (shl GPR:$op1rs2, (XLenVT 24))), - (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; + (PACKW zexti16:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>; def : Pat<(binop_allwusers<or> - (or (zexti16 (XLenVT GPR:$rs1)), - (shl GPR:$op1rs1, (XLenVT 24))), - (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))), - (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; + (or zexti16:$rs1, + (shl GPR:$op1rs2, (XLenVT 24))), + (shl zexti8:$op1rs1, (XLenVT 16))), + (PACKW zexti16:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>; def : Pat<(i64 (or (or (zexti16 (XLenVT GPR:$rs1)), - (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))), - (sext_inreg (shl GPR:$op1rs1, (XLenVT 24)), i32))), - (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; + (shl zexti8:$op1rs1, (XLenVT 16))), + (sext_inreg (shl GPR:$op1rs2, (XLenVT 24)), i32))), + (PACKW GPR:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>; // Match a pattern of 2 halfwords being inserted into bits [63:32], with bits // bits [31:0] coming from a zero extended value. 
We can use pack with packw for // bits [63:32]. If bits [63:31] can also be a packw, it can be matched // separately. def : Pat<(or (or (shl GPR:$op1rs2, (i64 48)), - (shl (zexti16 (i64 GPR:$op1rs1)), (i64 32))), - (zexti32 (i64 GPR:$rs1))), - (PACK (XLenVT GPR:$rs1), - (XLenVT (PACKW GPR:$op1rs1, GPR:$op1rs2)))>; + (shl zexti16:$op1rs1, (i64 32))), + zexti32:$rs1), + (PACK zexti32:$rs1, + (XLenVT (PACKW zexti16:$op1rs1, GPR:$op1rs2)))>; } // Predicates = [HasStdExtZbkb, IsRV64] let Predicates = [HasStdExtZbb, IsRV32] in diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td index 6d86aff..3658817 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td +++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td @@ -14,6 +14,10 @@ // otherwise. def VLDSX0Pred : MCSchedPredicate<CheckRegOperand<3, X0>>; +// This scheduling predicate is true when subtarget feature TuneHasSingleElementVecFP64 +// is enabled. +def SingleElementVecFP64SchedPred : FeatureSchedPredicate<TuneHasSingleElementVecFP64>; + // Returns true if this is the sext.w pattern, addiw rd, rs1, 0. def isSEXT_W : TIIPredicate<"isSEXT_W", diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 17a7948..e86431f 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -338,7 +338,8 @@ def SIFIVE_X390 : RISCVProcessorModel<"sifive-x390", FeatureStdExtZvl1024b, FeatureVendorXSiFivecdiscarddlone, FeatureVendorXSiFivecflushdlone], - SiFiveIntelligenceTuneFeatures>; + !listconcat(SiFiveIntelligenceTuneFeatures, + [TuneHasSingleElementVecFP64])>; defvar SiFiveP400TuneFeatures = [TuneNoDefaultUnroll, TuneConditionalCompressedMoveFusion, diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 3e07eff..f863392a 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -317,7 +317,6 @@ multiclass SiFive7WriteResBase<int VLEN, ProcResourceKind VL, ProcResourceKind VS, ProcResourceKind VCQ, SiFive7FPLatencies fpLatencies, - bit isFP64Throttled = false, bit hasFastGather = false> { // Branching @@ -832,29 +831,56 @@ multiclass SiFive7WriteResBase<int VLEN, // 13. 
Vector Floating-Point Instructions foreach mx = SchedMxListF in { foreach sew = SchedSEWSet<mx, isF=1>.val in { - defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 64)), - SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, - SiFive7GetCyclesDefault<mx>.c); - defvar Lat8 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 8); - defvar VA = !if(!and(isFP64Throttled, !eq(sew, 64)), VA1, VA1OrVA2); defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c; - let Latency = Lat8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { - defm : LMULSEWWriteResMXSEW<"WriteVFALUV", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFALUF", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMulV", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMulF", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFRecpV", [VCQ, VA1], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; - } - defvar Lat4 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 4); - let Latency = Lat4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { - defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [VCQ, VA], mx, sew, IsWorstCase>; - // min max require merge - defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>; + if !eq(sew, 64) then { + defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; + foreach SchedWriteName = ["WriteVFALUV", "WriteVFALUF", "WriteVFMulV", "WriteVFMulF", + "WriteVFMulAddV", "WriteVFMulAddF"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], !add(SingleElementCycles, 7), [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1OrVA2], 8, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)], + mx, sew, IsWorstCase>; + foreach SchedWriteName = ["WriteVFRecpV", "WriteVFCvtIToFV"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], !add(SingleElementCycles, 7), [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)], + mx, sew, IsWorstCase>; + foreach SchedWriteName = ["WriteVFSgnjV", "WriteVFSgnjF"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], !add(SingleElementCycles, 3), [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1OrVA2], 4, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)], + mx, sew, IsWorstCase>; + foreach SchedWriteName = ["WriteVFMinMaxV", "WriteVFMinMaxF"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], !add(SingleElementCycles, 3), [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 4, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)], + mx, sew, IsWorstCase>; + } else { + let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, SiFive7GetCyclesDefault<mx>.c)] in { + defm : LMULSEWWriteResMXSEW<"WriteVFALUV", [VCQ, 
VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMulF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFRecpV", [VCQ, VA1], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + } + let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, SiFive7GetCyclesDefault<mx>.c)] in { + defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + // min max require merge + defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>; + } } } } @@ -892,19 +918,28 @@ multiclass SiFive7WriteResBase<int VLEN, // Widening foreach mx = SchedMxListW in { foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in { - defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)), - SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, - SiFive7GetCyclesDefault<mx>.c); defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c; - let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in - defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + defvar DefaultCycles = SiFive7GetCyclesDefault<mx>.c; + if !eq(sew, 32) then { + defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; + defm : LMULSEWWriteResMXSEWVariant<"WriteVFWCvtIToFV", SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)], + mx, sew, IsWorstCase>; + } else { + let Latency = 8, + AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in + defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + } } } foreach mx = SchedMxListFW in { foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { - defvar Cycles = SiFive7GetCyclesDefault<mx>.c; + defvar DefaultCycles = SiFive7GetCyclesDefault<mx>.c; defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; - let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { + let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in { defm : LMULSEWWriteResMXSEW<"WriteVFWALUV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; defm : LMULSEWWriteResMXSEW<"WriteVFWALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; defm : LMULSEWWriteResMXSEW<"WriteVFWMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; @@ -912,11 +947,19 @@ multiclass SiFive7WriteResBase<int VLEN, defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; } - defvar CvtCycles = !if(!and(isFP64Throttled, !eq(sew, 32)), - SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, - SiFive7GetCyclesDefault<mx>.c); - let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, CvtCycles)] in - defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, 
sew, IsWorstCase>; + if !eq(sew, 32) then { + defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; + defm : LMULSEWWriteResMXSEWVariant<"WriteVFWCvtFToFV", SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)], + mx, sew, IsWorstCase>; + } else { + let Latency = 8, + AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in + defm : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + } } defvar Cycles = SiFive7GetCyclesDefault<mx>.c; defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListFW>.c; @@ -933,13 +976,23 @@ multiclass SiFive7WriteResBase<int VLEN, } foreach mx = SchedMxListFW in { foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { - defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)), - SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, - SiFive7GetCyclesNarrowing<mx>.c); defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; - let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { - defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + defvar DefaultCycles = SiFive7GetCyclesNarrowing<mx>.c; + if !eq(sew, 32) then { + defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; + foreach SchedWriteName = ["WriteVFNCvtIToFV", "WriteVFNCvtFToFV"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)], + mx, sew, IsWorstCase>; + } else { + let Latency = 8, + AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in { + defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + } } } } @@ -1499,7 +1552,6 @@ multiclass SiFive7ReadAdvance { /// eventually be supplied by different SchedMachineModels. 
multiclass SiFive7SchedResources<int vlen, bit extraVALU, SiFive7FPLatencies fpLatencies, - bit isFP64Throttled, bit hasFastGather> { defm SiFive7 : SiFive7ProcResources<extraVALU>; @@ -1527,8 +1579,7 @@ multiclass SiFive7SchedResources<int vlen, bit extraVALU, : SiFive7WriteResBase<vlen, SiFive7PipeA, SiFive7PipeB, SiFive7PipeAB, SiFive7IDiv, SiFive7FDiv, SiFive7VA1, SiFive7VA1OrVA2, SiFive7VL, SiFive7VS, - SiFive7VCQ, fpLatencies, isFP64Throttled, - hasFastGather>; + SiFive7VCQ, fpLatencies, hasFastGather>; //===----------------------------------------------------------------------===// // Bypass and advance @@ -1560,7 +1611,6 @@ class SiFive7SchedMachineModel<int vlen> : SchedMachineModel { bit HasExtraVALU = false; SiFive7FPLatencies FPLatencies; - bit IsFP64Throttled = false; bit HasFastGather = false; string Name = !subst("Model", "", !subst("SiFive7", "", NAME)); @@ -1587,7 +1637,6 @@ def SiFive7VLEN512Model : SiFive7SchedMachineModel<512> { def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> { let HasExtraVALU = true; let FPLatencies = SiFive7LowFPLatencies; - let IsFP64Throttled = true; let HasFastGather = true; } @@ -1596,7 +1645,6 @@ foreach model = [SiFive7VLEN512Model, SiFive7VLEN1024X300Model] in { let SchedModel = model in defm model.Name : SiFive7SchedResources<model.VLEN, model.HasExtraVALU, model.FPLatencies, - model.IsFP64Throttled, model.HasFastGather>; } diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td index 01a4308..d11b446 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleV.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td @@ -128,6 +128,22 @@ multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred, IsWorstCase>; } +multiclass LMULSEWWriteResMXSEWVariant<string name, SchedPredicateBase Pred, + list<ProcResourceKind> predResources, + int predLat, list<int> predAcquireCycles, + list<int> predReleaseCycles, + list<ProcResourceKind> noPredResources, + int noPredLat, list<int> noPredAcquireCycles, + list<int> noPredReleaseCycles, + string mx, int sew, bit IsWorstCase> { + defm "" : LMULWriteResVariantImpl<name, name # "_" # mx # "_E" # sew, Pred, predResources, + predLat, predAcquireCycles, + predReleaseCycles, noPredResources, + noPredLat, noPredAcquireCycles, + noPredReleaseCycles, + IsWorstCase>; +} + // Define multiclasses to define SchedWrite, SchedRead, WriteRes, and // ReadAdvance for each (name, LMUL) pair and for each LMUL in each of the // SchedMxList variants above. 
Each multiclass is responsible for defining diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index e8c849e..28a1690 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -46,7 +46,6 @@ #include "SPIRVSubtarget.h" #include "SPIRVTargetMachine.h" #include "SPIRVUtils.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" diff --git a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp index 20f03b0..60d39c9 100644 --- a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp @@ -19,7 +19,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Intrinsics.h" diff --git a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp index 278ad7c..e621bcd44 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp @@ -14,7 +14,6 @@ #include "SPIRV.h" #include "SPIRVSubtarget.h" #include "SPIRVUtils.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Transforms/Utils/Cloning.h" diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp index 1811492..5b149f8 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp @@ -16,7 +16,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp index af79070..275165d 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp @@ -184,8 +184,8 @@ void SystemZInstPrinterCommon::printPCRelTLSOperand(const MCInst *MI, // Output the TLS marker if present. 
if ((unsigned)OpNum + 1 < MI->getNumOperands()) { const MCOperand &MO = MI->getOperand(OpNum + 1); - const MCSymbolRefExpr &refExp = cast<MCSymbolRefExpr>(*MO.getExpr()); - switch (refExp.getSpecifier()) { + const MCSymbolRefExpr &RefExp = cast<MCSymbolRefExpr>(*MO.getExpr()); + switch (RefExp.getSpecifier()) { case SystemZ::S_TLSGD: O << ":tls_gdcall:"; break; @@ -195,7 +195,7 @@ void SystemZInstPrinterCommon::printPCRelTLSOperand(const MCInst *MI, default: llvm_unreachable("Unexpected symbol kind"); } - O << refExp.getSymbol().getName(); + O << RefExp.getSymbol().getName(); } } diff --git a/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp index fce6393..8c31579 100644 --- a/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp +++ b/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp @@ -13,10 +13,9 @@ using namespace llvm; -SystemZConstantPoolValue:: -SystemZConstantPoolValue(const GlobalValue *gv, - SystemZCP::SystemZCPModifier modifier) - : MachineConstantPoolValue(gv->getType()), GV(gv), Modifier(modifier) {} +SystemZConstantPoolValue::SystemZConstantPoolValue( + const GlobalValue *GV, SystemZCP::SystemZCPModifier Modifier) + : MachineConstantPoolValue(GV->getType()), GV(GV), Modifier(Modifier) {} SystemZConstantPoolValue * SystemZConstantPoolValue::Create(const GlobalValue *GV, diff --git a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp index 34d58e0..5313fba 100644 --- a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp +++ b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp @@ -352,10 +352,9 @@ int SystemZHazardRecognizer::groupingCost(SUnit *SU) const { // Similarly, a group-ending SU may either fit well (last in group), or // end the group prematurely. 
if (SC->EndGroup) { - unsigned resultingGroupSize = - (CurrGroupSize + getNumDecoderSlots(SU)); - if (resultingGroupSize < 3) - return (3 - resultingGroupSize); + unsigned ResultingGroupSize = (CurrGroupSize + getNumDecoderSlots(SU)); + if (ResultingGroupSize < 3) + return (3 - ResultingGroupSize); return -1; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9580ade..1cfcb1f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -28,7 +28,6 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/VectorUtils.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 3bc46af..6dd43b2 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -547,7 +547,7 @@ unsigned X86TargetLowering::getAddressSpace() const { static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || - (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17)); + TargetTriple.isAndroid(); } static Constant* SegmentOffset(IRBuilderBase &IRB, diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index 764ff998..4b3ddbd 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -592,10 +592,10 @@ def : Pat<(X86mcvttp2sis (v2f64 (X86VBroadcastld64 addr:$src)), (VCVTTPD2DQSZ128rmbkz VK2WM:$mask, addr:$src)>; // Patterns VCVTTPD2UDQSZ128 -def : Pat<(v4i32 (X86cvttp2uis (v2f64 (X86VBroadcastld64 addr:$src)))), - (VCVTTPD2UDQSZ128rmb addr:$src)>; def : Pat<(v4i32 (X86cvttp2uis (v2f64 VR128X:$src))), (VCVTTPD2UDQSZ128rr VR128X:$src)>; +def : Pat<(v4i32 (X86cvttp2uis (loadv2f64 addr:$src))), + (VCVTTPD2UDQSZ128rm addr:$src)>; def : Pat<(v4i32 (X86cvttp2uis (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTTPD2UDQSZ128rmb addr:$src)>; def : Pat<(X86mcvttp2uis (v2f64 VR128X:$src), (v4i32 VR128X:$src0), diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 805bdb4..bbbac45 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -28,8 +28,12 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" @@ -39,6 +43,10 @@ using namespace PatternMatch; #define DEBUG_TYPE "aggressive-instcombine" +namespace llvm { +extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} + STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded"); STATISTIC(NumGuardedRotates, "Number of guarded rotates transformed into funnel shifts"); @@ -599,6 +607,14 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) { auto Cmp = B.CreateICmpEQ(X1, ConstantInt::get(XType, 0)); 
auto Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Cttz); + // The true branch of select handles the cttz(0) case, which is rare. + if (!ProfcheckDisableMetadataFixes) { + if (Instruction *SelectI = dyn_cast<Instruction>(Select)) + SelectI->setMetadata( + LLVMContext::MD_prof, + MDBuilder(SelectI->getContext()).createUnlikelyBranchWeights()); + } + // NOTE: If the table[0] is 0, but the cttz(0) is defined by the Target // it should be handled as: `cttz(x) & (typeSize - 1)`. diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp index 9b9e2ba..9150b58 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp @@ -459,7 +459,7 @@ void TruncInstCombine::ReduceExpressionGraph(Type *SclTy) { Value *Op0 = I->getOperand(0); Value *LHS = getReducedOperand(I->getOperand(1), SclTy); Value *RHS = getReducedOperand(I->getOperand(2), SclTy); - Res = Builder.CreateSelect(Op0, LHS, RHS); + Res = Builder.CreateSelect(Op0, LHS, RHS, "", I); break; } case Instruction::PHI: { diff --git a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp index 9115946..f166fef 100644 --- a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp @@ -24,6 +24,9 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -33,6 +36,11 @@ using namespace llvm; #define DEBUG_TYPE "coro-annotation-elide" +static cl::opt<float> CoroElideBranchRatio( + "coro-elide-branch-ratio", cl::init(0.55), cl::Hidden, + cl::desc("Minimum BranchProbability to consider eliding a coroutine.")); +extern cl::opt<unsigned> MinBlockCounterExecution; + static Instruction *getFirstNonAllocaInTheEntryBlock(Function *F) { for (Instruction &I : F->getEntryBlock()) if (!isa<AllocaInst>(&I)) @@ -145,6 +153,30 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C, bool IsCallerPresplitCoroutine = Caller->isPresplitCoroutine(); bool HasAttr = CB->hasFnAttr(llvm::Attribute::CoroElideSafe); if (IsCallerPresplitCoroutine && HasAttr) { + BranchProbability MinBranchProbability( + static_cast<int>(CoroElideBranchRatio * MinBlockCounterExecution), + MinBlockCounterExecution); + + auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller); + + auto Prob = BranchProbability::getBranchProbability( + BFI.getBlockFreq(CB->getParent()).getFrequency(), + BFI.getEntryFreq().getFrequency()); + + if (Prob < MinBranchProbability) { + ORE.emit([&]() { + return OptimizationRemarkMissed( + DEBUG_TYPE, "CoroAnnotationElideUnlikely", Caller) + << "'" << ore::NV("callee", Callee->getName()) + << "' not elided in '" + << ore::NV("caller", Caller->getName()) + << "' because of low probability: " + << ore::NV("probability", Prob) << " (threshold: " + << ore::NV("threshold", MinBranchProbability) << ")"; + }); + continue; + } + auto *CallerN = CG.lookup(*Caller); auto *CallerC = CallerN ?
CG.lookupSCC(*CallerN) : nullptr; // If CallerC is nullptr, it means LazyCallGraph hasn't visited Caller @@ -156,7 +188,7 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C, return OptimizationRemark(DEBUG_TYPE, "CoroAnnotationElide", Caller) << "'" << ore::NV("callee", Callee->getName()) << "' elided in '" << ore::NV("caller", Caller->getName()) - << "'"; + << "' (probability: " << ore::NV("probability", Prob) << ")"; }); FAM.invalidate(*Caller, PreservedAnalyses::none()); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index cfdfd94..5066a99 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -1033,19 +1033,17 @@ private: }; } // namespace -namespace llvm { template <> -struct DenseMapInfo<typename CallsiteContextGraph< +struct llvm::DenseMapInfo<typename CallsiteContextGraph< ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo> : public DenseMapInfo<std::pair<Instruction *, unsigned>> {}; template <> -struct DenseMapInfo<typename CallsiteContextGraph< +struct llvm::DenseMapInfo<typename CallsiteContextGraph< IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo> : public DenseMapInfo<std::pair<IndexCall, unsigned>> {}; template <> -struct DenseMapInfo<IndexCall> +struct llvm::DenseMapInfo<IndexCall> : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {}; -} // end namespace llvm namespace { diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp index 2583249..1a00d17 100644 --- a/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -109,7 +109,7 @@ static cl::opt<float> MinRegionSizeRatio( "outline candidate and original function")); // Used to tune the minimum number of execution counts needed in the predecessor // block to the cold edge. ie. confidence interval. -static cl::opt<unsigned> +cl::opt<unsigned> MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, cl::desc("Minimum block executions to consider " "its BranchProbabilityInfo valid")); diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index ac41fdd..2d5cb82 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -372,9 +372,7 @@ struct VTableSlot { } // end anonymous namespace -namespace llvm { - -template <> struct DenseMapInfo<VTableSlot> { +template <> struct llvm::DenseMapInfo<VTableSlot> { static VTableSlot getEmptyKey() { return {DenseMapInfo<Metadata *>::getEmptyKey(), DenseMapInfo<uint64_t>::getEmptyKey()}; @@ -393,7 +391,7 @@ template <> struct DenseMapInfo<VTableSlot> { } }; -template <> struct DenseMapInfo<VTableSlotSummary> { +template <> struct llvm::DenseMapInfo<VTableSlotSummary> { static VTableSlotSummary getEmptyKey() { return {DenseMapInfo<StringRef>::getEmptyKey(), DenseMapInfo<uint64_t>::getEmptyKey()}; @@ -412,8 +410,6 @@ template <> struct DenseMapInfo<VTableSlotSummary> { } }; -} // end namespace llvm - // Returns true if the function must be unreachable based on ValueInfo. 
// // In particular, identifies a function as unreachable in the following diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 9b272c4..3ddf182 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -28,6 +28,10 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" +namespace llvm { +extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} + /// This is the complement of getICmpCode, which turns an opcode and two /// operands into either a constant true or false, or a brand new ICmp /// instruction. The sign is passed in to determine which kind of predicate to @@ -1272,7 +1276,8 @@ Value *InstCombinerImpl::foldEqOfParts(Value *Cmp0, Value *Cmp1, bool IsAnd) { static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd, bool IsLogical, InstCombiner::BuilderTy &Builder, - const SimplifyQuery &Q) { + const SimplifyQuery &Q, + Instruction &I) { // Match an equality compare with a non-poison constant as Cmp0. // Also, give up if the compare can be constant-folded to avoid looping. CmpPredicate Pred0; @@ -1306,9 +1311,12 @@ static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1, return nullptr; SubstituteCmp = Builder.CreateICmp(Pred1, Y, C); } - if (IsLogical) - return IsAnd ? Builder.CreateLogicalAnd(Cmp0, SubstituteCmp) - : Builder.CreateLogicalOr(Cmp0, SubstituteCmp); + if (IsLogical) { + Instruction *MDFrom = + ProfcheckDisableMetadataFixes && isa<SelectInst>(I) ? nullptr : &I; + return IsAnd ? Builder.CreateLogicalAnd(Cmp0, SubstituteCmp, "", MDFrom) + : Builder.CreateLogicalOr(Cmp0, SubstituteCmp, "", MDFrom); + } return Builder.CreateBinOp(IsAnd ? Instruction::And : Instruction::Or, Cmp0, SubstituteCmp); } @@ -3396,13 +3404,13 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, /*IsLogical*/ false, Builder)) return V; - if (Value *V = - foldAndOrOfICmpsWithConstEq(LHS, RHS, IsAnd, IsLogical, Builder, Q)) + if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, IsAnd, IsLogical, + Builder, Q, I)) return V; // We can convert this case to bitwise and, because both operands are used // on the LHS, and as such poison from both will propagate. - if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, IsAnd, - /*IsLogical=*/false, Builder, Q)) { + if (Value *V = foldAndOrOfICmpsWithConstEq( + RHS, LHS, IsAnd, /*IsLogical=*/false, Builder, Q, I)) { // If RHS is still used, we should drop samesign flag. if (IsLogical && RHS->hasSameSign() && !RHS->use_empty()) { RHS->setSameSign(false); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 56194fe..4c9b10a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2202,6 +2202,11 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) { return commonCastTransforms(CI); } +Instruction *InstCombinerImpl::visitPtrToAddr(PtrToAddrInst &CI) { + // FIXME: Implement variants of ptrtoint folds. + return commonCastTransforms(CI); +} + /// This input value (which is known to have vector type) is being zero extended /// or truncated to the specified vector type. 
Since the zext/trunc is done /// using an integer type, we have a (bitcast(cast(bitcast))) pattern, diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index e01c145..218aaf9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -143,6 +143,7 @@ public: Instruction *visitUIToFP(CastInst &CI); Instruction *visitSIToFP(CastInst &CI); Instruction *visitPtrToInt(PtrToIntInst &CI); + Instruction *visitPtrToAddr(PtrToAddrInst &CI); Instruction *visitIntToPtr(IntToPtrInst &CI); Instruction *visitBitCast(BitCastInst &CI); Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI); diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 5c747bb..9815644 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -1069,27 +1069,22 @@ struct LoweredPHIRecord { }; } // namespace -namespace llvm { - template<> - struct DenseMapInfo<LoweredPHIRecord> { - static inline LoweredPHIRecord getEmptyKey() { - return LoweredPHIRecord(nullptr, 0); - } - static inline LoweredPHIRecord getTombstoneKey() { - return LoweredPHIRecord(nullptr, 1); - } - static unsigned getHashValue(const LoweredPHIRecord &Val) { - return DenseMapInfo<PHINode*>::getHashValue(Val.PN) ^ (Val.Shift>>3) ^ - (Val.Width>>3); - } - static bool isEqual(const LoweredPHIRecord &LHS, - const LoweredPHIRecord &RHS) { - return LHS.PN == RHS.PN && LHS.Shift == RHS.Shift && - LHS.Width == RHS.Width; - } - }; -} // namespace llvm - +template <> struct llvm::DenseMapInfo<LoweredPHIRecord> { + static inline LoweredPHIRecord getEmptyKey() { + return LoweredPHIRecord(nullptr, 0); + } + static inline LoweredPHIRecord getTombstoneKey() { + return LoweredPHIRecord(nullptr, 1); + } + static unsigned getHashValue(const LoweredPHIRecord &Val) { + return DenseMapInfo<PHINode *>::getHashValue(Val.PN) ^ (Val.Shift >> 3) ^ + (Val.Width >> 3); + } + static bool isEqual(const LoweredPHIRecord &LHS, + const LoweredPHIRecord &RHS) { + return LHS.PN == RHS.PN && LHS.Shift == RHS.Shift && LHS.Width == RHS.Width; + } +}; /// This is an integer PHI and we know that it has an illegal type: see if it is /// only used by trunc or trunc(lshr) operations. 
If so, we split the PHI into diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 3704ad7..860f8f7 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -600,9 +600,7 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize, !IsRISCV64 && !IsLoongArch64 && !(Mapping.Offset & (Mapping.Offset - 1)) && Mapping.Offset != kDynamicShadowSentinel; - bool IsAndroidWithIfuncSupport = - IsAndroid && !TargetTriple.isAndroidVersionLT(21); - Mapping.InGlobal = ClWithIfunc && IsAndroidWithIfuncSupport && IsArmOrThumb; + Mapping.InGlobal = ClWithIfunc && IsAndroid && IsArmOrThumb; return Mapping; } diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 5b8ea15..b74a070 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -1084,8 +1084,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, auto ThenTerm = SplitBlockAndInsertIfThen( IRB.CreateIsNull(Load), &*IP, false, MDBuilder(IRB.getContext()).createUnlikelyBranchWeights()); - IRBuilder<> ThenIRB(ThenTerm); + InstrumentationIRBuilder ThenIRB(ThenTerm); auto Store = ThenIRB.CreateStore(ConstantInt::getTrue(Int1Ty), FlagPtr); + if (EntryLoc) + Store->setDebugLoc(EntryLoc); Load->setNoSanitizeMetadata(); Store->setNoSanitizeMetadata(); } @@ -1131,7 +1133,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, EstimatedStackSize >= Options.StackDepthCallbackMin) { if (InsertBefore) IRB.SetInsertPoint(InsertBefore); - IRB.CreateCall(SanCovStackDepthCallback)->setCannotMerge(); + auto Call = IRB.CreateCall(SanCovStackDepthCallback); + if (EntryLoc) + Call->setDebugLoc(EntryLoc); + Call->setCannotMerge(); } } else { // Check stack depth. If it's the deepest so far, record it. 
@@ -1144,8 +1149,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, auto ThenTerm = SplitBlockAndInsertIfThen( IsStackLower, &*IP, false, MDBuilder(IRB.getContext()).createUnlikelyBranchWeights()); - IRBuilder<> ThenIRB(ThenTerm); + InstrumentationIRBuilder ThenIRB(ThenTerm); auto Store = ThenIRB.CreateStore(FrameAddrInt, SanCovLowestStack); + if (EntryLoc) + Store->setDebugLoc(EntryLoc); LowestStack->setNoSanitizeMetadata(); Store->setNoSanitizeMetadata(); } diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index e448230..e5935f4 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -61,6 +61,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DomTreeUpdater.h" @@ -147,19 +148,16 @@ public: class DFAJumpThreading { public: - DFAJumpThreading(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI, + DFAJumpThreading(AssumptionCache *AC, DomTreeUpdater *DTU, LoopInfo *LI, TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE) - : AC(AC), DT(DT), LI(LI), TTI(TTI), ORE(ORE) {} + : AC(AC), DTU(DTU), LI(LI), TTI(TTI), ORE(ORE) {} bool run(Function &F); bool LoopInfoBroken; private: void - unfoldSelectInstrs(DominatorTree *DT, - const SmallVector<SelectInstToUnfold, 4> &SelectInsts) { - // TODO: Have everything use a single lazy DTU - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + unfoldSelectInstrs(const SmallVector<SelectInstToUnfold, 4> &SelectInsts) { SmallVector<SelectInstToUnfold, 4> Stack(SelectInsts); while (!Stack.empty()) { @@ -167,7 +165,7 @@ private: std::vector<SelectInstToUnfold> NewSIsToUnfold; std::vector<BasicBlock *> NewBBs; - unfold(&DTU, LI, SIToUnfold, &NewSIsToUnfold, &NewBBs); + unfold(DTU, LI, SIToUnfold, &NewSIsToUnfold, &NewBBs); // Put newly discovered select instructions into the work list. llvm::append_range(Stack, NewSIsToUnfold); @@ -180,7 +178,7 @@ private: std::vector<BasicBlock *> *NewBBs); AssumptionCache *AC; - DominatorTree *DT; + DomTreeUpdater *DTU; LoopInfo *LI; TargetTransformInfo *TTI; OptimizationRemarkEmitter *ORE; @@ -382,19 +380,28 @@ typedef DenseMap<BasicBlock *, CloneList> DuplicateBlockMap; typedef MapVector<Instruction *, std::vector<Instruction *>> DefMap; inline raw_ostream &operator<<(raw_ostream &OS, const PathType &Path) { - OS << "< "; - for (const BasicBlock *BB : Path) { - std::string BBName; - if (BB->hasName()) - raw_string_ostream(BBName) << BB->getName(); - else - raw_string_ostream(BBName) << BB; - OS << BBName << " "; - } - OS << ">"; + auto BBNames = llvm::map_range( + Path, [](const BasicBlock *BB) { return BB->getNameOrAsOperand(); }); + OS << "< " << llvm::join(BBNames, ", ") << " >"; return OS; } +/// Helper to get the successor corresponding to a particular case value for +/// a switch statement. 
+static BasicBlock *getNextCaseSuccessor(SwitchInst *Switch, + const APInt &NextState) { + BasicBlock *NextCase = nullptr; + for (auto Case : Switch->cases()) { + if (Case.getCaseValue()->getValue() == NextState) { + NextCase = Case.getCaseSuccessor(); + break; + } + } + if (!NextCase) + NextCase = Switch->getDefaultDest(); + return NextCase; +} + namespace { /// ThreadingPath is a path in the control flow of a loop that can be threaded /// by cloning necessary basic blocks and replacing conditional branches with @@ -407,6 +414,10 @@ struct ThreadingPath { ExitVal = V->getValue(); IsExitValSet = true; } + void setExitValue(const APInt &V) { + ExitVal = V; + IsExitValSet = true; + } bool isExitValueSet() const { return IsExitValSet; } /// Determinator is the basic block that determines the next state of the DFA. @@ -423,7 +434,7 @@ struct ThreadingPath { } void print(raw_ostream &OS) const { - OS << Path << " [ " << ExitVal << ", " << DBB->getName() << " ]"; + OS << Path << " [ " << ExitVal << ", " << DBB->getNameOrAsOperand() << " ]"; } private: @@ -589,44 +600,8 @@ struct AllSwitchPaths { BasicBlock *getSwitchBlock() { return SwitchBlock; } void run() { - StateDefMap StateDef = getStateDefMap(); - if (StateDef.empty()) { - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable", - Switch) - << "Switch instruction is not predictable."; - }); - return; - } - - auto *SwitchPhi = cast<PHINode>(Switch->getOperand(0)); - auto *SwitchPhiDefBB = SwitchPhi->getParent(); - VisitedBlocks VB; - // Get paths from the determinator BBs to SwitchPhiDefBB - std::vector<ThreadingPath> PathsToPhiDef = - getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths); - if (SwitchPhiDefBB == SwitchBlock || PathsToPhiDef.empty()) { - TPaths = std::move(PathsToPhiDef); - return; - } - - assert(MaxNumPaths >= PathsToPhiDef.size() && !PathsToPhiDef.empty()); - auto PathsLimit = MaxNumPaths / PathsToPhiDef.size(); - // Find and append paths from SwitchPhiDefBB to SwitchBlock. - PathsType PathsToSwitchBB = - paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit); - if (PathsToSwitchBB.empty()) - return; - - std::vector<ThreadingPath> TempList; - for (const ThreadingPath &Path : PathsToPhiDef) { - for (const PathType &PathToSw : PathsToSwitchBB) { - ThreadingPath PathCopy(Path); - PathCopy.appendExcludingFirst(PathToSw); - TempList.push_back(PathCopy); - } - } - TPaths = std::move(TempList); + findTPaths(); + unifyTPaths(); } private: @@ -818,6 +793,69 @@ private: return Res; } + // Find all threadable paths. + void findTPaths() { + StateDefMap StateDef = getStateDefMap(); + if (StateDef.empty()) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable", + Switch) + << "Switch instruction is not predictable."; + }); + return; + } + + auto *SwitchPhi = cast<PHINode>(Switch->getOperand(0)); + auto *SwitchPhiDefBB = SwitchPhi->getParent(); + VisitedBlocks VB; + // Get paths from the determinator BBs to SwitchPhiDefBB + std::vector<ThreadingPath> PathsToPhiDef = + getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths); + if (SwitchPhiDefBB == SwitchBlock || PathsToPhiDef.empty()) { + TPaths = std::move(PathsToPhiDef); + return; + } + + assert(MaxNumPaths >= PathsToPhiDef.size() && !PathsToPhiDef.empty()); + auto PathsLimit = MaxNumPaths / PathsToPhiDef.size(); + // Find and append paths from SwitchPhiDefBB to SwitchBlock. 
+ PathsType PathsToSwitchBB = + paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit); + if (PathsToSwitchBB.empty()) + return; + + std::vector<ThreadingPath> TempList; + for (const ThreadingPath &Path : PathsToPhiDef) { + for (const PathType &PathToSw : PathsToSwitchBB) { + ThreadingPath PathCopy(Path); + PathCopy.appendExcludingFirst(PathToSw); + TempList.push_back(PathCopy); + } + } + TPaths = std::move(TempList); + } + + // Two states are equivalent if they have the same switch destination. + // Unify the states in different threading paths if the states are equivalent. + void unifyTPaths() { + llvm::SmallDenseMap<BasicBlock *, APInt> DestToState; + for (ThreadingPath &Path : TPaths) { + APInt NextState = Path.getExitValue(); + BasicBlock *Dest = getNextCaseSuccessor(Switch, NextState); + auto StateIt = DestToState.find(Dest); + if (StateIt == DestToState.end()) { + DestToState.insert({Dest, NextState}); + continue; + } + + if (NextState != StateIt->second) { + LLVM_DEBUG(dbgs() << "Next state in " << Path << " is equivalent to " + << StateIt->second << "\n"); + Path.setExitValue(StateIt->second); + } + } + } + unsigned NumVisited = 0; SwitchInst *Switch; BasicBlock *SwitchBlock; @@ -828,11 +866,11 @@ private: }; struct TransformDFA { - TransformDFA(AllSwitchPaths *SwitchPaths, DominatorTree *DT, + TransformDFA(AllSwitchPaths *SwitchPaths, DomTreeUpdater *DTU, AssumptionCache *AC, TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, SmallPtrSet<const Value *, 32> EphValues) - : SwitchPaths(SwitchPaths), DT(DT), AC(AC), TTI(TTI), ORE(ORE), + : SwitchPaths(SwitchPaths), DTU(DTU), AC(AC), TTI(TTI), ORE(ORE), EphValues(EphValues) {} bool run() { @@ -1008,19 +1046,16 @@ private: SmallPtrSet<BasicBlock *, 16> BlocksToClean; BlocksToClean.insert_range(successors(SwitchBlock)); - { - DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy); - for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { - createExitPath(NewDefs, TPath, DuplicateMap, BlocksToClean, &DTU); - NumPaths++; - } - - // After all paths are cloned, now update the last successor of the cloned - // path so it skips over the switch statement - for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) - updateLastSuccessor(TPath, DuplicateMap, &DTU); + for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { + createExitPath(NewDefs, TPath, DuplicateMap, BlocksToClean, DTU); + NumPaths++; } + // After all paths are cloned, now update the last successor of the cloned + // path so it skips over the switch statement + for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) + updateLastSuccessor(TPath, DuplicateMap, DTU); + // For each instruction that was cloned and used outside, update its uses updateSSA(NewDefs); @@ -1124,7 +1159,7 @@ private: } // SSAUpdater handles phi placement and renaming uses with the appropriate // value. - SSAUpdate.RewriteAllUses(DT); + SSAUpdate.RewriteAllUses(&DTU->getDomTree()); } /// Clones a basic block, and adds it to the CFG. @@ -1341,28 +1376,13 @@ private: return It != ClonedBBs.end() ? (*It).BB : nullptr; } - /// Helper to get the successor corresponding to a particular case value for - /// a switch statement.
- BasicBlock *getNextCaseSuccessor(SwitchInst *Switch, const APInt &NextState) { - BasicBlock *NextCase = nullptr; - for (auto Case : Switch->cases()) { - if (Case.getCaseValue()->getValue() == NextState) { - NextCase = Case.getCaseSuccessor(); - break; - } - } - if (!NextCase) - NextCase = Switch->getDefaultDest(); - return NextCase; - } - /// Returns true if IncomingBB is a predecessor of BB. bool isPredecessor(BasicBlock *BB, BasicBlock *IncomingBB) { return llvm::is_contained(predecessors(BB), IncomingBB); } AllSwitchPaths *SwitchPaths; - DominatorTree *DT; + DomTreeUpdater *DTU; AssumptionCache *AC; TargetTransformInfo *TTI; OptimizationRemarkEmitter *ORE; @@ -1405,7 +1425,7 @@ bool DFAJumpThreading::run(Function &F) { << "candidate for jump threading\n"); LLVM_DEBUG(SI->dump()); - unfoldSelectInstrs(DT, Switch.getSelectInsts()); + unfoldSelectInstrs(Switch.getSelectInsts()); if (!Switch.getSelectInsts().empty()) MadeChanges = true; @@ -1427,7 +1447,7 @@ bool DFAJumpThreading::run(Function &F) { } #ifdef NDEBUG - LI->verify(*DT); + LI->verify(DTU->getDomTree()); #endif SmallPtrSet<const Value *, 32> EphValues; @@ -1435,13 +1455,15 @@ bool DFAJumpThreading::run(Function &F) { CodeMetrics::collectEphemeralValues(&F, AC, EphValues); for (AllSwitchPaths SwitchPaths : ThreadableLoops) { - TransformDFA Transform(&SwitchPaths, DT, AC, TTI, ORE, EphValues); + TransformDFA Transform(&SwitchPaths, DTU, AC, TTI, ORE, EphValues); if (Transform.run()) MadeChanges = LoopInfoBroken = true; } + DTU->flush(); + #ifdef EXPENSIVE_CHECKS - assert(DT->verify(DominatorTree::VerificationLevel::Full)); + assert(DTU->getDomTree().verify(DominatorTree::VerificationLevel::Full)); verifyFunction(F, &dbgs()); #endif @@ -1456,7 +1478,9 @@ PreservedAnalyses DFAJumpThreadingPass::run(Function &F, LoopInfo &LI = AM.getResult<LoopAnalysis>(F); TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); OptimizationRemarkEmitter ORE(&F); - DFAJumpThreading ThreadImpl(&AC, &DT, &LI, &TTI, &ORE); + + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + DFAJumpThreading ThreadImpl(&AC, &DTU, &LI, &TTI, &ORE); if (!ThreadImpl.run(F)) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 0f8cc6c..2afa7b7 100644 --- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -108,7 +108,7 @@ struct SimpleValue { // of instruction handled below (UnaryOperator, etc.). if (CallInst *CI = dyn_cast<CallInst>(Inst)) { if (Function *F = CI->getCalledFunction()) { - switch ((Intrinsic::ID)F->getIntrinsicID()) { + switch (F->getIntrinsicID()) { case Intrinsic::experimental_constrained_fadd: case Intrinsic::experimental_constrained_fsub: case Intrinsic::experimental_constrained_fmul: @@ -154,9 +154,7 @@ struct SimpleValue { } // end anonymous namespace -namespace llvm { - -template <> struct DenseMapInfo<SimpleValue> { +template <> struct llvm::DenseMapInfo<SimpleValue> { static inline SimpleValue getEmptyKey() { return DenseMapInfo<Instruction *>::getEmptyKey(); } @@ -169,8 +167,6 @@ template <> struct DenseMapInfo<SimpleValue> { static bool isEqual(SimpleValue LHS, SimpleValue RHS); }; -} // end namespace llvm - /// Match a 'select' including an optional 'not's of the condition. 
static bool matchSelectWithOptionalNotCond(Value *V, Value *&Cond, Value *&A, Value *&B, @@ -509,9 +505,7 @@ struct CallValue { } // end anonymous namespace -namespace llvm { - -template <> struct DenseMapInfo<CallValue> { +template <> struct llvm::DenseMapInfo<CallValue> { static inline CallValue getEmptyKey() { return DenseMapInfo<Instruction *>::getEmptyKey(); } @@ -524,8 +518,6 @@ template <> struct DenseMapInfo<CallValue> { static bool isEqual(CallValue LHS, CallValue RHS); }; -} // end namespace llvm - unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { Instruction *Inst = Val.Inst; @@ -580,9 +572,7 @@ struct GEPValue { } // namespace -namespace llvm { - -template <> struct DenseMapInfo<GEPValue> { +template <> struct llvm::DenseMapInfo<GEPValue> { static inline GEPValue getEmptyKey() { return DenseMapInfo<Instruction *>::getEmptyKey(); } @@ -595,8 +585,6 @@ template <> struct DenseMapInfo<GEPValue> { static bool isEqual(const GEPValue &LHS, const GEPValue &RHS); }; -} // end namespace llvm - unsigned DenseMapInfo<GEPValue>::getHashValue(const GEPValue &Val) { auto *GEP = cast<GetElementPtrInst>(Val.Inst); if (Val.ConstantOffset.has_value()) diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 638952a..3a8ade8 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -170,9 +170,7 @@ struct llvm::GVNPass::Expression { } }; -namespace llvm { - -template <> struct DenseMapInfo<GVNPass::Expression> { +template <> struct llvm::DenseMapInfo<GVNPass::Expression> { static inline GVNPass::Expression getEmptyKey() { return ~0U; } static inline GVNPass::Expression getTombstoneKey() { return ~1U; } @@ -188,8 +186,6 @@ template <> struct DenseMapInfo<GVNPass::Expression> { } }; -} // end namespace llvm - /// Represents a particular available value that we know how to materialize. /// Materialization of an AvailableValue never fails. An AvailableValue is /// implicitly associated with a rematerialization point which is the @@ -2084,13 +2080,6 @@ bool GVNPass::processNonLocalLoad(LoadInst *Load) { return Changed; } -static bool hasUsersIn(Value *V, BasicBlock *BB) { - return any_of(V->users(), [BB](User *U) { - auto *I = dyn_cast<Instruction>(U); - return I && I->getParent() == BB; - }); -} - bool GVNPass::processAssumeIntrinsic(AssumeInst *IntrinsicI) { Value *V = IntrinsicI->getArgOperand(0); @@ -2149,85 +2138,7 @@ bool GVNPass::processAssumeIntrinsic(AssumeInst *IntrinsicI) { } Constant *True = ConstantInt::getTrue(V->getContext()); - bool Changed = false; - - for (BasicBlock *Successor : successors(IntrinsicI->getParent())) { - BasicBlockEdge Edge(IntrinsicI->getParent(), Successor); - - // This property is only true in dominated successors, propagateEquality - // will check dominance for us. - Changed |= propagateEquality(V, True, Edge, false); - } - - // We can replace assume value with true, which covers cases like this: - // call void @llvm.assume(i1 %cmp) - // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true - ReplaceOperandsWithMap[V] = True; - - // Similarly, after assume(!NotV) we know that NotV == false. - Value *NotV; - if (match(V, m_Not(m_Value(NotV)))) - ReplaceOperandsWithMap[NotV] = ConstantInt::getFalse(V->getContext()); - - // If we find an equality fact, canonicalize all dominated uses in this block - // to one of the two values. We heuristically choice the "oldest" of the - // two where age is determined by value number. 
(Note that propagateEquality - // above handles the cross block case.) - // - // Key case to cover are: - // 1) - // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen - // call void @llvm.assume(i1 %cmp) - // ret float %0 ; will change it to ret float 3.000000e+00 - // 2) - // %load = load float, float* %addr - // %cmp = fcmp oeq float %load, %0 - // call void @llvm.assume(i1 %cmp) - // ret float %load ; will change it to ret float %0 - if (auto *CmpI = dyn_cast<CmpInst>(V)) { - if (CmpI->isEquivalence()) { - Value *CmpLHS = CmpI->getOperand(0); - Value *CmpRHS = CmpI->getOperand(1); - // Heuristically pick the better replacement -- the choice of heuristic - // isn't terribly important here, but the fact we canonicalize on some - // replacement is for exposing other simplifications. - // TODO: pull this out as a helper function and reuse w/ existing - // (slightly different) logic. - if (isa<Constant>(CmpLHS) && !isa<Constant>(CmpRHS)) - std::swap(CmpLHS, CmpRHS); - if (!isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS)) - std::swap(CmpLHS, CmpRHS); - if ((isa<Argument>(CmpLHS) && isa<Argument>(CmpRHS)) || - (isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))) { - // Move the 'oldest' value to the right-hand side, using the value - // number as a proxy for age. - uint32_t LVN = VN.lookupOrAdd(CmpLHS); - uint32_t RVN = VN.lookupOrAdd(CmpRHS); - if (LVN < RVN) - std::swap(CmpLHS, CmpRHS); - } - - // Handle degenerate case where we either haven't pruned a dead path or a - // removed a trivial assume yet. - if (isa<Constant>(CmpLHS) && isa<Constant>(CmpRHS)) - return Changed; - - LLVM_DEBUG(dbgs() << "Replacing dominated uses of " - << *CmpLHS << " with " - << *CmpRHS << " in block " - << IntrinsicI->getParent()->getName() << "\n"); - - // Setup the replacement map - this handles uses within the same block. - if (hasUsersIn(CmpLHS, IntrinsicI->getParent())) - ReplaceOperandsWithMap[CmpLHS] = CmpRHS; - - // NOTE: The non-block local cases are handled by the call to - // propagateEquality above; this block is just about handling the block - // local cases. TODO: There's a bunch of logic in propagateEqualiy which - // isn't duplicated for the block local case, can we share it somehow? - } - } - return Changed; + return propagateEquality(V, True, IntrinsicI); } static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) { @@ -2526,39 +2437,28 @@ void GVNPass::assignBlockRPONumber(Function &F) { InvalidBlockRPONumbers = false; } -bool GVNPass::replaceOperandsForInBlockEquality(Instruction *Instr) const { - bool Changed = false; - for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) { - Use &Operand = Instr->getOperandUse(OpNum); - auto It = ReplaceOperandsWithMap.find(Operand.get()); - if (It != ReplaceOperandsWithMap.end()) { - const DataLayout &DL = Instr->getDataLayout(); - if (!canReplacePointersInUseIfEqual(Operand, It->second, DL)) - continue; - - LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " - << *It->second << " in instruction " << *Instr << '\n'); - Instr->setOperand(OpNum, It->second); - Changed = true; - } - } - return Changed; -} - -/// The given values are known to be equal in every block +/// The given values are known to be equal in every use /// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with /// 'RHS' everywhere in the scope. Returns whether a change was made. -/// If DominatesByEdge is false, then it means that we will propagate the RHS -/// value starting from the end of Root.Start. 
-bool GVNPass::propagateEquality(Value *LHS, Value *RHS, - const BasicBlockEdge &Root, - bool DominatesByEdge) { +/// The Root may either be a basic block edge (for conditions) or an +/// instruction (for assumes). +bool GVNPass::propagateEquality( + Value *LHS, Value *RHS, + const std::variant<BasicBlockEdge, Instruction *> &Root) { SmallVector<std::pair<Value*, Value*>, 4> Worklist; Worklist.push_back(std::make_pair(LHS, RHS)); bool Changed = false; - // For speed, compute a conservative fast approximation to - // DT->dominates(Root, Root.getEnd()); - const bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT); + SmallVector<const BasicBlock *> DominatedBlocks; + if (const BasicBlockEdge *Edge = std::get_if<BasicBlockEdge>(&Root)) { + // For speed, compute a conservative fast approximation to + // DT->dominates(Root, Root.getEnd()); + if (isOnlyReachableViaThisEdge(*Edge, DT)) + DominatedBlocks.push_back(Edge->getEnd()); + } else { + Instruction *I = std::get<Instruction *>(Root); + for (const auto *Node : DT->getNode(I->getParent())->children()) + DominatedBlocks.push_back(Node->getBlock()); + } while (!Worklist.empty()) { std::pair<Value*, Value*> Item = Worklist.pop_back_val(); @@ -2606,9 +2506,9 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS, // using the leader table is about compiling faster, not optimizing better). // The leader table only tracks basic blocks, not edges. Only add to if we // have the simple case where the edge dominates the end. - if (RootDominatesEnd && !isa<Instruction>(RHS) && - canReplacePointersIfEqual(LHS, RHS, DL)) - LeaderTable.insert(LVN, RHS, Root.getEnd()); + if (!isa<Instruction>(RHS) && canReplacePointersIfEqual(LHS, RHS, DL)) + for (const BasicBlock *BB : DominatedBlocks) + LeaderTable.insert(LVN, RHS, BB); // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As // LHS always has at least one use that is not dominated by Root, this will @@ -2618,12 +2518,14 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS, auto CanReplacePointersCallBack = [&DL](const Use &U, const Value *To) { return canReplacePointersInUseIfEqual(U, To, DL); }; - unsigned NumReplacements = - DominatesByEdge - ? replaceDominatedUsesWithIf(LHS, RHS, *DT, Root, - CanReplacePointersCallBack) - : replaceDominatedUsesWithIf(LHS, RHS, *DT, Root.getStart(), - CanReplacePointersCallBack); + unsigned NumReplacements; + if (const BasicBlockEdge *Edge = std::get_if<BasicBlockEdge>(&Root)) + NumReplacements = replaceDominatedUsesWithIf( + LHS, RHS, *DT, *Edge, CanReplacePointersCallBack); + else + NumReplacements = replaceDominatedUsesWithIf( + LHS, RHS, *DT, std::get<Instruction *>(Root), + CanReplacePointersCallBack); if (NumReplacements > 0) { Changed = true; @@ -2682,26 +2584,45 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS, // If the number we were assigned was brand new then there is no point in // looking for an instruction realizing it: there cannot be one! if (Num < NextNum) { - Value *NotCmp = findLeader(Root.getEnd(), Num); - if (NotCmp && isa<Instruction>(NotCmp)) { - unsigned NumReplacements = - DominatesByEdge - ? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root) - : replaceDominatedUsesWith(NotCmp, NotVal, *DT, - Root.getStart()); - Changed |= NumReplacements > 0; - NumGVNEqProp += NumReplacements; - // Cached information for anything that uses NotCmp will be invalid. 
- if (MD) - MD->invalidateCachedPointerInfo(NotCmp); + for (const auto &Entry : LeaderTable.getLeaders(Num)) { + // Only look at leaders that either dominate the start of the edge, + // or are dominated by the end. This check is not necessary for + // correctness, it only discards cases for which the following + // use replacement will not work anyway. + if (const BasicBlockEdge *Edge = std::get_if<BasicBlockEdge>(&Root)) { + if (!DT->dominates(Entry.BB, Edge->getStart()) && + !DT->dominates(Edge->getEnd(), Entry.BB)) + continue; + } else { + auto *InstBB = std::get<Instruction *>(Root)->getParent(); + if (!DT->dominates(Entry.BB, InstBB) && + !DT->dominates(InstBB, Entry.BB)) + continue; + } + + Value *NotCmp = Entry.Val; + if (NotCmp && isa<Instruction>(NotCmp)) { + unsigned NumReplacements; + if (const BasicBlockEdge *Edge = std::get_if<BasicBlockEdge>(&Root)) + NumReplacements = + replaceDominatedUsesWith(NotCmp, NotVal, *DT, *Edge); + else + NumReplacements = replaceDominatedUsesWith( + NotCmp, NotVal, *DT, std::get<Instruction *>(Root)); + Changed |= NumReplacements > 0; + NumGVNEqProp += NumReplacements; + // Cached information for anything that uses NotCmp will be invalid. + if (MD) + MD->invalidateCachedPointerInfo(NotCmp); + } } } // Ensure that any instruction in scope that gets the "A < B" value number // is replaced with false. // The leader table only tracks basic blocks, not edges. Only add to if we // have the simple case where the edge dominates the end. - if (RootDominatesEnd) - LeaderTable.insert(Num, NotVal, Root.getEnd()); + for (const BasicBlock *BB : DominatedBlocks) + LeaderTable.insert(Num, NotVal, BB); continue; } @@ -2789,11 +2710,11 @@ bool GVNPass::processInstruction(Instruction *I) { Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext()); BasicBlockEdge TrueE(Parent, TrueSucc); - Changed |= propagateEquality(BranchCond, TrueVal, TrueE, true); + Changed |= propagateEquality(BranchCond, TrueVal, TrueE); Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext()); BasicBlockEdge FalseE(Parent, FalseSucc); - Changed |= propagateEquality(BranchCond, FalseVal, FalseE, true); + Changed |= propagateEquality(BranchCond, FalseVal, FalseE); return Changed; } @@ -2814,7 +2735,7 @@ bool GVNPass::processInstruction(Instruction *I) { // If there is only a single edge, propagate the case value into it. if (SwitchEdges.lookup(Dst) == 1) { BasicBlockEdge E(Parent, Dst); - Changed |= propagateEquality(SwitchCond, Case.getCaseValue(), E, true); + Changed |= propagateEquality(SwitchCond, Case.getCaseValue(), E); } } return Changed; @@ -2942,8 +2863,6 @@ bool GVNPass::processBlock(BasicBlock *BB) { if (DeadBlocks.count(BB)) return false; - // Clearing map before every BB because it can be used only for single BB. 
- ReplaceOperandsWithMap.clear(); bool ChangedFunction = false; // Since we may not have visited the input blocks of the phis, we can't @@ -2955,11 +2874,8 @@ bool GVNPass::processBlock(BasicBlock *BB) { for (PHINode *PN : PHINodesToRemove) { removeInstruction(PN); } - for (Instruction &Inst : make_early_inc_range(*BB)) { - if (!ReplaceOperandsWithMap.empty()) - ChangedFunction |= replaceOperandsForInBlockEquality(&Inst); + for (Instruction &Inst : make_early_inc_range(*BB)) ChangedFunction |= processInstruction(&Inst); - } return ChangedFunction; } diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 3c1a8ba..80aa98d 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -434,10 +434,6 @@ private: int StoreCount = 0; }; -} // end anonymous namespace - -namespace llvm { - struct ExactEqualsExpression { const Expression &E; @@ -449,8 +445,9 @@ struct ExactEqualsExpression { return E.exactlyEquals(Other); } }; +} // end anonymous namespace -template <> struct DenseMapInfo<const Expression *> { +template <> struct llvm::DenseMapInfo<const Expression *> { static const Expression *getEmptyKey() { auto Val = static_cast<uintptr_t>(-1); Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable; @@ -493,8 +490,6 @@ template <> struct DenseMapInfo<const Expression *> { } }; -} // end namespace llvm - namespace { class NewGVN { diff --git a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp index 2190dcd..a87822c 100644 --- a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp +++ b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp @@ -84,10 +84,6 @@ public: bool run(); }; -} // anonymous namespace - -namespace llvm { - struct FrozenIndPHIInfo { // A freeze instruction that uses an induction phi FreezeInst *FI = nullptr; @@ -103,7 +99,9 @@ struct FrozenIndPHIInfo { bool operator==(const FrozenIndPHIInfo &Other) { return FI == Other.FI; } }; -template <> struct DenseMapInfo<FrozenIndPHIInfo> { +} // namespace + +template <> struct llvm::DenseMapInfo<FrozenIndPHIInfo> { static inline FrozenIndPHIInfo getEmptyKey() { return FrozenIndPHIInfo(DenseMapInfo<PHINode *>::getEmptyKey(), DenseMapInfo<BinaryOperator *>::getEmptyKey()); @@ -124,8 +122,6 @@ template <> struct DenseMapInfo<FrozenIndPHIInfo> { }; }; -} // end namespace llvm - // Given U = (value, user), replace value with freeze(value), and let // SCEV forget user. The inserted freeze is placed in the preheader. 
void CanonicalizeFreezeInLoopsImpl::InsertFreezeAndForgetFromSCEV(Use &U) { diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index b6ca52e..46f2903 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3246,6 +3246,13 @@ unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, return ::replaceDominatedUsesWith(From, To, Dominates); } +unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, + DominatorTree &DT, + const Instruction *I) { + auto Dominates = [&](const Use &U) { return DT.dominates(I, U); }; + return ::replaceDominatedUsesWith(From, To, Dominates); +} + unsigned llvm::replaceDominatedUsesWithIf( Value *From, Value *To, DominatorTree &DT, const BasicBlockEdge &Root, function_ref<bool(const Use &U, const Value *To)> ShouldReplace) { @@ -3264,6 +3271,15 @@ unsigned llvm::replaceDominatedUsesWithIf( return ::replaceDominatedUsesWith(From, To, DominatesAndShouldReplace); } +unsigned llvm::replaceDominatedUsesWithIf( + Value *From, Value *To, DominatorTree &DT, const Instruction *I, + function_ref<bool(const Use &U, const Value *To)> ShouldReplace) { + auto DominatesAndShouldReplace = [&](const Use &U) { + return DT.dominates(I, U) && ShouldReplace(U, To); + }; + return ::replaceDominatedUsesWith(From, To, DominatesAndShouldReplace); +} + bool llvm::callsGCLeafFunction(const CallBase *Call, const TargetLibraryInfo &TLI) { // Check if the function is specifically marked as a gc leaf function. diff --git a/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/llvm/lib/Transforms/Utils/LowerInvoke.cpp index ff2ab3c..cecb662 100644 --- a/llvm/lib/Transforms/Utils/LowerInvoke.cpp +++ b/llvm/lib/Transforms/Utils/LowerInvoke.cpp @@ -27,15 +27,15 @@ using namespace llvm; STATISTIC(NumInvokes, "Number of invokes replaced"); namespace { - class LowerInvokeLegacyPass : public FunctionPass { - public: - static char ID; // Pass identification, replacement for typeid - explicit LowerInvokeLegacyPass() : FunctionPass(ID) { - initializeLowerInvokeLegacyPassPass(*PassRegistry::getPassRegistry()); - } - bool runOnFunction(Function &F) override; - }; -} +class LowerInvokeLegacyPass : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + explicit LowerInvokeLegacyPass() : FunctionPass(ID) { + initializeLowerInvokeLegacyPassPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override; +}; +} // namespace char LowerInvokeLegacyPass::ID = 0; INITIALIZE_PASS(LowerInvokeLegacyPass, "lowerinvoke", @@ -78,11 +78,12 @@ bool LowerInvokeLegacyPass::runOnFunction(Function &F) { return runImpl(F); } -namespace llvm { -char &LowerInvokePassID = LowerInvokeLegacyPass::ID; +char &llvm::LowerInvokePassID = LowerInvokeLegacyPass::ID; // Public Interface To the LowerInvoke pass. 
-FunctionPass *createLowerInvokePass() { return new LowerInvokeLegacyPass(); } +FunctionPass *llvm::createLowerInvokePass() { + return new LowerInvokeLegacyPass(); +} PreservedAnalyses LowerInvokePass::run(Function &F, FunctionAnalysisManager &AM) { @@ -92,4 +93,3 @@ PreservedAnalyses LowerInvokePass::run(Function &F, return PreservedAnalyses::none(); } -} diff --git a/llvm/lib/Transforms/Utils/MisExpect.cpp b/llvm/lib/Transforms/Utils/MisExpect.cpp index ca7e09d..1585e9e 100644 --- a/llvm/lib/Transforms/Utils/MisExpect.cpp +++ b/llvm/lib/Transforms/Utils/MisExpect.cpp @@ -48,8 +48,6 @@ using namespace llvm; using namespace misexpect; -namespace llvm { - // Command line option to enable/disable the warning when profile data suggests // a mismatch with the use of the llvm.expect intrinsic static cl::opt<bool> PGOWarnMisExpect( @@ -63,22 +61,18 @@ static cl::opt<uint32_t> MisExpectTolerance( cl::desc("Prevents emitting diagnostics when profile counts are " "within N% of the threshold..")); -} // namespace llvm - -namespace { - -bool isMisExpectDiagEnabled(LLVMContext &Ctx) { +static bool isMisExpectDiagEnabled(const LLVMContext &Ctx) { return PGOWarnMisExpect || Ctx.getMisExpectWarningRequested(); } -uint32_t getMisExpectTolerance(LLVMContext &Ctx) { +static uint32_t getMisExpectTolerance(const LLVMContext &Ctx) { return std::max(static_cast<uint32_t>(MisExpectTolerance), Ctx.getDiagnosticsMisExpectTolerance()); } -Instruction *getInstCondition(Instruction *I) { +static const Instruction *getInstCondition(const Instruction *I) { assert(I != nullptr && "MisExpect target Instruction cannot be nullptr"); - Instruction *Ret = nullptr; + const Instruction *Ret = nullptr; if (auto *B = dyn_cast<BranchInst>(I)) { Ret = dyn_cast<Instruction>(B->getCondition()); } @@ -97,8 +91,8 @@ Instruction *getInstCondition(Instruction *I) { return Ret ? Ret : I; } -void emitMisexpectDiagnostic(Instruction *I, LLVMContext &Ctx, - uint64_t ProfCount, uint64_t TotalCount) { +static void emitMisexpectDiagnostic(const Instruction *I, LLVMContext &Ctx, + uint64_t ProfCount, uint64_t TotalCount) { double PercentageCorrect = (double)ProfCount / TotalCount; auto PerString = formatv("{0:P} ({1} / {2})", PercentageCorrect, ProfCount, TotalCount); @@ -106,20 +100,16 @@ void emitMisexpectDiagnostic(Instruction *I, LLVMContext &Ctx, "Potential performance regression from use of the llvm.expect intrinsic: " "Annotation was correct on {0} of profiled executions.", PerString); - Instruction *Cond = getInstCondition(I); + const Instruction *Cond = getInstCondition(I); if (isMisExpectDiagEnabled(Ctx)) Ctx.diagnose(DiagnosticInfoMisExpect(Cond, Twine(PerString))); OptimizationRemarkEmitter ORE(I->getParent()->getParent()); ORE.emit(OptimizationRemark(DEBUG_TYPE, "misexpect", Cond) << RemStr.str()); } -} // namespace - -namespace llvm { -namespace misexpect { - -void verifyMisExpect(Instruction &I, ArrayRef<uint32_t> RealWeights, - ArrayRef<uint32_t> ExpectedWeights) { +void misexpect::verifyMisExpect(const Instruction &I, + ArrayRef<uint32_t> RealWeights, + ArrayRef<uint32_t> ExpectedWeights) { // To determine if we emit a diagnostic, we need to compare the branch weights // from the profile to those added by the llvm.expect intrinsic. 
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 155fcc5..d831c27 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5959,7 +5959,11 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
       unsigned PreviousEdges = OtherCases->size();
       if (OtherDest == SI->getDefaultDest())
         ++PreviousEdges;
-      for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
+      unsigned E = PreviousEdges - 1;
+      // Remove all incoming values from OtherDest if OtherDest is unreachable.
+      if (NewBI->isUnconditional())
+        ++E;
+      for (unsigned I = 0; I != E; ++I)
         cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
     }
 
@@ -7736,8 +7740,7 @@ struct SwitchSuccWrapper {
   DenseMap<PHINode *, SmallDenseMap<BasicBlock *, Value *, 8>> *PhiPredIVs;
 };
 
-namespace llvm {
-template <> struct DenseMapInfo<const SwitchSuccWrapper *> {
+template <> struct llvm::DenseMapInfo<const SwitchSuccWrapper *> {
   static const SwitchSuccWrapper *getEmptyKey() {
     return static_cast<SwitchSuccWrapper *>(
         DenseMapInfo<void *>::getEmptyKey());
@@ -7805,7 +7808,6 @@ template <> struct DenseMapInfo<const SwitchSuccWrapper *> {
     return true;
   }
 };
-} // namespace llvm
 
 bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI,
                                                  DomTreeUpdater *DTU) {
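The turnSwitchRangeIntoICmp hunk above changes how many PHI incoming values are dropped when the switch is folded to a branch. A standalone sketch of that bookkeeping follows; the helper is hypothetical and only mirrors the patched arithmetic rather than calling it.

    // Illustrative only: a switch contributes one PHI edge to OtherDest per
    // case, plus one if OtherDest is also the default. A conditional
    // replacement branch keeps one edge, so PreviousEdges - 1 values are
    // removed; an unconditional branch (OtherDest unreachable) keeps none,
    // so all PreviousEdges values are removed.
    static unsigned numIncomingValuesToRemove(unsigned NumOtherCases,
                                              bool OtherIsDefault,
                                              bool NewBranchIsUnconditional) {
      unsigned PreviousEdges = NumOtherCases + (OtherIsDefault ? 1 : 0);
      unsigned E = PreviousEdges - 1;
      if (NewBranchIsUnconditional)
        ++E;
      return E;
    }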
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3f16b03..e62d57e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5696,7 +5696,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
       Instruction *I = Worklist.pop_back_val();
       for (auto &Op : I->operands())
         if (auto *InstOp = dyn_cast<Instruction>(Op))
-          if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
+          if (TheLoop->contains(InstOp) && !isa<PHINode>(InstOp) &&
              AddrDefs.insert(InstOp).second)
            Worklist.push_back(InstOp);
     }
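The LoopVectorize.cpp hunk widens the operand walk from "defined in the same basic block as its user" to "defined anywhere in the loop". A sketch of the widened predicate in isolation; the helper name is an assumption and TheLoop is presumed available as in the surrounding code.

    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Illustrative only: an operand now feeds the address-computation worklist
    // if it is any non-PHI instruction inside the loop, so chains that span
    // several loop blocks are also visited.
    static bool feedsAddressComputation(const Loop *TheLoop,
                                        const Instruction *InstOp) {
      return TheLoop->contains(InstOp) && !isa<PHINode>(InstOp);
    }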