Diffstat (limited to 'llvm/lib')
164 files changed, 2946 insertions, 1224 deletions
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index 8d20b0e..805b682 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -1180,32 +1180,41 @@ bool DependenceInfo::isKnownLessThan(const SCEV *S, const SCEV *Size) const {
   S = SE->getTruncateOrZeroExtend(S, MaxType);
   Size = SE->getTruncateOrZeroExtend(Size, MaxType);
 
-  // Special check for addrecs using BE taken count
-  if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S))
-    if (AddRec->isAffine() && AddRec->hasNoSignedWrap()) {
-      const SCEV *BECount = SE->getBackedgeTakenCount(AddRec->getLoop());
-      const SCEV *Start = AddRec->getStart();
-      const SCEV *Step = AddRec->getStepRecurrence(*SE);
-      const SCEV *End = AddRec->evaluateAtIteration(BECount, *SE);
-      const SCEV *Diff0 = SE->getMinusSCEV(Start, Size);
-      const SCEV *Diff1 = SE->getMinusSCEV(End, Size);
-
-      // If the value of Step is non-negative and the AddRec is non-wrap, it
-      // reaches its maximum at the last iteration. So it's enouth to check
-      // whether End - Size is negative.
-      if (SE->isKnownNonNegative(Step) && SE->isKnownNegative(Diff1))
-        return true;
+  auto CheckAddRecBECount = [&]() {
+    const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S);
+    if (!AddRec || !AddRec->isAffine() || !AddRec->hasNoSignedWrap())
+      return false;
+    const SCEV *BECount = collectUpperBound(AddRec->getLoop(), MaxType);
+    // If the BTC cannot be computed, check the base case for S.
+    if (!BECount || isa<SCEVCouldNotCompute>(BECount))
+      return false;
+    const SCEV *Start = AddRec->getStart();
+    const SCEV *Step = AddRec->getStepRecurrence(*SE);
+    const SCEV *End = AddRec->evaluateAtIteration(BECount, *SE);
+    const SCEV *Diff0 = SE->getMinusSCEV(Start, Size);
+    const SCEV *Diff1 = SE->getMinusSCEV(End, Size);
+
+    // If the value of Step is non-negative and the AddRec is non-wrap, it
+    // reaches its maximum at the last iteration. So it's enough to check
+    // whether End - Size is negative.
+    if (SE->isKnownNonNegative(Step) && SE->isKnownNegative(Diff1))
+      return true;
 
-      // If the value of Step is non-positive and the AddRec is non-wrap, the
-      // initial value is its maximum.
-      if (SE->isKnownNonPositive(Step) && SE->isKnownNegative(Diff0))
-        return true;
+    // If the value of Step is non-positive and the AddRec is non-wrap, the
+    // initial value is its maximum.
+    if (SE->isKnownNonPositive(Step) && SE->isKnownNegative(Diff0))
+      return true;
 
-      // Even if we don't know the sign of Step, either Start or End must be
-      // the maximum value of the AddRec since it is non-wrap.
-      if (SE->isKnownNegative(Diff0) && SE->isKnownNegative(Diff1))
-        return true;
-    }
+    // Even if we don't know the sign of Step, either Start or End must be
+    // the maximum value of the AddRec since it is non-wrap.
+    if (SE->isKnownNegative(Diff0) && SE->isKnownNegative(Diff1))
+      return true;
+
+    return false;
+  };
+
+  if (CheckAddRecBECount())
+    return true;
 
   // Check using normal isKnownNegative
   const SCEV *LimitedBound = SE->getMinusSCEV(S, Size);
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index b8c540c..9f8ac6e 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -849,17 +849,12 @@ RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind,
 ///   %sum.2 = select %cmp, %add, %sum.1
 RecurrenceDescriptor::InstDesc
 RecurrenceDescriptor::isConditionalRdxPattern(Instruction *I) {
-  SelectInst *SI = dyn_cast<SelectInst>(I);
-  if (!SI)
-    return InstDesc(false, I);
-
-  CmpInst *CI = dyn_cast<CmpInst>(SI->getCondition());
+  Value *TrueVal, *FalseVal;
   // Only handle single use cases for now.
-  if (!CI || !CI->hasOneUse())
+  if (!match(I,
+             m_Select(m_OneUse(m_Cmp()), m_Value(TrueVal), m_Value(FalseVal))))
     return InstDesc(false, I);
 
-  Value *TrueVal = SI->getTrueValue();
-  Value *FalseVal = SI->getFalseValue();
   // Handle only when either of operands of select instruction is a PHI
   // node for now.
   if ((isa<PHINode>(TrueVal) && isa<PHINode>(FalseVal)) ||
@@ -886,7 +881,7 @@ RecurrenceDescriptor::isConditionalRdxPattern(Instruction *I) {
   if (!IPhi || IPhi != FalseVal)
     return InstDesc(false, I);
 
-  return InstDesc(true, SI);
+  return InstDesc(true, I);
 }
 
 RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 4e38626..e08ef60 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -6644,7 +6644,7 @@ Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType,
            "Invalid mask width");
     // If index-width (mask size) is less than pointer-size then mask is
     // 1-extended.
-    if (match(Op1, m_PtrToInt(m_Specific(Op0))))
+    if (match(Op1, m_PtrToIntOrAddr(m_Specific(Op0))))
      return Op0;
 
     // NOTE: We may have attributes associated with the return value of the
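The IVDescriptors hunk above folds three chained dyn_cast checks into a single PatternMatch query. A minimal sketch of that matcher shape, assuming LLVM's PatternMatch.h (the helper name isSelectOfOneUseCmp is invented for illustration):

// Recognize `select (one-use cmp), TV, FV` in one match() call.
#include "llvm/IR/Instruction.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

static bool isSelectOfOneUseCmp(Instruction *I, Value *&TV, Value *&FV) {
  // m_OneUse rejects compares with more than one user, mirroring the
  // hasOneUse() check in the removed code; TV/FV bind the select arms.
  return match(I, m_Select(m_OneUse(m_Cmp()), m_Value(TV), m_Value(FV)));
}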
diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp
index ab37338..0b2e3fc 100644
--- a/llvm/lib/Analysis/MemorySSA.cpp
+++ b/llvm/lib/Analysis/MemorySSA.cpp
@@ -393,7 +393,7 @@ static bool isUseTriviallyOptimizableToLiveOnEntry(AliasAnalysisType &AA,
 /// \param AA The AliasAnalysis we used for our search.
 /// \param AllowImpreciseClobber Always false, unless we do relaxed verify.
-LLVM_ATTRIBUTE_UNUSED static void
+[[maybe_unused]] static void
 checkClobberSanity(const MemoryAccess *Start, MemoryAccess *ClobberAt,
                    const MemoryLocation &StartLoc, const MemorySSA &MSSA,
                    const UpwardsMemoryQuery &Query, BatchAAResults &AA,
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index b5b4cd9..a64b93d 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -1774,7 +1774,7 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty,
   {
     const SCEV *LHS;
     const SCEV *RHS;
-    if (matchURem(Op, LHS, RHS))
+    if (match(Op, m_scev_URem(m_SCEV(LHS), m_SCEV(RHS), *this)))
       return getURemExpr(getZeroExtendExpr(LHS, Ty, Depth + 1),
                          getZeroExtendExpr(RHS, Ty, Depth + 1));
   }
@@ -2699,17 +2699,12 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
   }
 
   // Canonicalize (-1 * urem X, Y) + X --> (Y * X/Y)
-  if (Ops.size() == 2) {
-    const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Ops[0]);
-    if (Mul && Mul->getNumOperands() == 2 &&
-        Mul->getOperand(0)->isAllOnesValue()) {
-      const SCEV *X;
-      const SCEV *Y;
-      if (matchURem(Mul->getOperand(1), X, Y) && X == Ops[1]) {
-        return getMulExpr(Y, getUDivExpr(X, Y));
-      }
-    }
-  }
+  const SCEV *Y;
+  if (Ops.size() == 2 &&
+      match(Ops[0],
+            m_scev_Mul(m_scev_AllOnes(),
+                       m_scev_URem(m_scev_Specific(Ops[1]), m_SCEV(Y), *this))))
+    return getMulExpr(Y, getUDivExpr(Ops[1], Y));
 
   // Skip past any other cast SCEVs.
   while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddExpr)
@@ -5419,20 +5414,15 @@ static Type *isSimpleCastedPHI(const SCEV *Op, const SCEVUnknown *SymbolicPHI,
   if (SourceBits != NewBits)
     return nullptr;
 
-  const SCEVSignExtendExpr *SExt = dyn_cast<SCEVSignExtendExpr>(Op);
-  const SCEVZeroExtendExpr *ZExt = dyn_cast<SCEVZeroExtendExpr>(Op);
-  if (!SExt && !ZExt)
-    return nullptr;
-  const SCEVTruncateExpr *Trunc =
-      SExt ? dyn_cast<SCEVTruncateExpr>(SExt->getOperand())
-           : dyn_cast<SCEVTruncateExpr>(ZExt->getOperand());
-  if (!Trunc)
-    return nullptr;
-  const SCEV *X = Trunc->getOperand();
-  if (X != SymbolicPHI)
-    return nullptr;
-  Signed = SExt != nullptr;
-  return Trunc->getType();
+  if (match(Op, m_scev_SExt(m_scev_Trunc(m_scev_Specific(SymbolicPHI))))) {
+    Signed = true;
+    return cast<SCEVCastExpr>(Op)->getOperand()->getType();
+  }
+  if (match(Op, m_scev_ZExt(m_scev_Trunc(m_scev_Specific(SymbolicPHI))))) {
+    Signed = false;
+    return cast<SCEVCastExpr>(Op)->getOperand()->getType();
+  }
+  return nullptr;
 }
 
 static const Loop *isIntegerLoopHeaderPHI(const PHINode *PN, LoopInfo &LI) {
@@ -6427,8 +6417,18 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S,
   case scSequentialUMinExpr:
     return GetGCDMultiple(cast<SCEVNAryExpr>(S));
   case scUnknown: {
-    // ask ValueTracking for known bits
+    // Ask ValueTracking for known bits. A SCEVUnknown only becomes available
+    // at the point its underlying IR instruction has been defined. If CtxI
+    // was not provided, use:
+    // * the first instruction in the entry block if it is an argument
+    // * the instruction itself otherwise.
     const SCEVUnknown *U = cast<SCEVUnknown>(S);
+    if (!CtxI) {
+      if (isa<Argument>(U->getValue()))
+        CtxI = &*F.getEntryBlock().begin();
+      else if (auto *I = dyn_cast<Instruction>(U->getValue()))
+        CtxI = I;
+    }
     unsigned Known =
         computeKnownBits(U->getValue(), getDataLayout(), &AC, CtxI, &DT)
             .countMinTrailingZeros();
@@ -15415,67 +15415,6 @@ void PredicatedScalarEvolution::print(raw_ostream &OS, unsigned Depth) const {
   }
 }
 
-// Match the mathematical pattern A - (A / B) * B, where A and B can be
-// arbitrary expressions. Also match zext (trunc A to iB) to iY, which is used
-// for URem with constant power-of-2 second operands.
-// It's not always easy, as A and B can be folded (imagine A is X / 2, and B is
-// 4, A / B becomes X / 8).
-bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS,
-                                const SCEV *&RHS) {
-  if (Expr->getType()->isPointerTy())
-    return false;
-
-  // Try to match 'zext (trunc A to iB) to iY', which is used
-  // for URem with constant power-of-2 second operands. Make sure the size of
-  // the operand A matches the size of the whole expressions.
-  if (const auto *ZExt = dyn_cast<SCEVZeroExtendExpr>(Expr))
-    if (const auto *Trunc = dyn_cast<SCEVTruncateExpr>(ZExt->getOperand(0))) {
-      LHS = Trunc->getOperand();
-      // Bail out if the type of the LHS is larger than the type of the
-      // expression for now.
-      if (getTypeSizeInBits(LHS->getType()) >
-          getTypeSizeInBits(Expr->getType()))
-        return false;
-      if (LHS->getType() != Expr->getType())
-        LHS = getZeroExtendExpr(LHS, Expr->getType());
-      RHS = getConstant(APInt(getTypeSizeInBits(Expr->getType()), 1)
-                        << getTypeSizeInBits(Trunc->getType()));
-      return true;
-    }
-  const auto *Add = dyn_cast<SCEVAddExpr>(Expr);
-  if (Add == nullptr || Add->getNumOperands() != 2)
-    return false;
-
-  const SCEV *A = Add->getOperand(1);
-  const auto *Mul = dyn_cast<SCEVMulExpr>(Add->getOperand(0));
-
-  if (Mul == nullptr)
-    return false;
-
-  const auto MatchURemWithDivisor = [&](const SCEV *B) {
-    // (SomeExpr + (-(SomeExpr / B) * B)).
-    if (Expr == getURemExpr(A, B)) {
-      LHS = A;
-      RHS = B;
-      return true;
-    }
-    return false;
-  };
-
-  // (SomeExpr + (-1 * (SomeExpr / B) * B)).
-  if (Mul->getNumOperands() == 3 && isa<SCEVConstant>(Mul->getOperand(0)))
-    return MatchURemWithDivisor(Mul->getOperand(1)) ||
-           MatchURemWithDivisor(Mul->getOperand(2));
-
-  // (SomeExpr + ((-SomeExpr / B) * B)) or (SomeExpr + ((SomeExpr / B) * -B)).
-  if (Mul->getNumOperands() == 2)
-    return MatchURemWithDivisor(Mul->getOperand(1)) ||
-           MatchURemWithDivisor(Mul->getOperand(0)) ||
-           MatchURemWithDivisor(getNegativeSCEV(Mul->getOperand(1))) ||
-           MatchURemWithDivisor(getNegativeSCEV(Mul->getOperand(0)));
-  return false;
-}
-
 ScalarEvolution::LoopGuards
 ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
   BasicBlock *Header = L->getHeader();
@@ -15696,20 +15635,18 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
     if (Predicate == CmpInst::ICMP_EQ && match(RHS, m_scev_Zero())) {
       // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to
       // explicitly express that.
-      const SCEV *URemLHS = nullptr;
+      const SCEVUnknown *URemLHS = nullptr;
       const SCEV *URemRHS = nullptr;
-      if (SE.matchURem(LHS, URemLHS, URemRHS)) {
-        if (const SCEVUnknown *LHSUnknown = dyn_cast<SCEVUnknown>(URemLHS)) {
-          auto I = RewriteMap.find(LHSUnknown);
-          const SCEV *RewrittenLHS =
-              I != RewriteMap.end() ? I->second : LHSUnknown;
-          RewrittenLHS = ApplyDivisibiltyOnMinMaxExpr(RewrittenLHS, URemRHS);
-          const auto *Multiple =
-              SE.getMulExpr(SE.getUDivExpr(RewrittenLHS, URemRHS), URemRHS);
-          RewriteMap[LHSUnknown] = Multiple;
-          ExprsToRewrite.push_back(LHSUnknown);
-          return;
-        }
+      if (match(LHS,
+                m_scev_URem(m_SCEVUnknown(URemLHS), m_SCEV(URemRHS), SE))) {
+        auto I = RewriteMap.find(URemLHS);
+        const SCEV *RewrittenLHS = I != RewriteMap.end() ? I->second : URemLHS;
+        RewrittenLHS = ApplyDivisibiltyOnMinMaxExpr(RewrittenLHS, URemRHS);
+        const auto *Multiple =
+            SE.getMulExpr(SE.getUDivExpr(RewrittenLHS, URemRHS), URemRHS);
+        RewriteMap[URemLHS] = Multiple;
+        ExprsToRewrite.push_back(URemLHS);
+        return;
       }
     }
 
@@ -15834,6 +15771,21 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
         const SCEV *OneAlignedUp = GetNextSCEVDividesByDivisor(One, DividesBy);
         To = SE.getUMaxExpr(FromRewritten, OneAlignedUp);
+      } else {
+        if (LHS->getType()->isPointerTy()) {
+          LHS = SE.getLosslessPtrToIntExpr(LHS);
+          RHS = SE.getLosslessPtrToIntExpr(RHS);
+          if (isa<SCEVCouldNotCompute>(LHS) || isa<SCEVCouldNotCompute>(RHS))
+            break;
+        }
+        auto AddSubRewrite = [&](const SCEV *A, const SCEV *B) {
+          const SCEV *Sub = SE.getMinusSCEV(A, B);
+          AddRewrite(Sub, Sub,
+                     SE.getUMaxExpr(Sub, SE.getOne(From->getType())));
+        };
+        AddSubRewrite(LHS, RHS);
+        AddSubRewrite(RHS, LHS);
+        continue;
       }
       break;
     default:
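The ScalarEvolution changes above rely on the two arithmetic identities that the removed matchURem encoded structurally: A - (A /u B) * B == A urem B, and zext(trunc A to iB) == A urem 2^B. A standalone spot-check in plain C++ (no LLVM dependency):

#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t X = 0; X < 512; ++X) {
    for (uint64_t Y = 1; Y < 512; ++Y) {
      // The pattern behind the (-1 * urem X, Y) + X --> (Y * X/Y) fold.
      assert(X - (X / Y) * Y == X % Y);
    }
    // zext(trunc X to i8) == X urem 2^8, the power-of-2 special case.
    assert(uint64_t(uint8_t(X)) == X % 256);
  }
}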
diff --git a/llvm/lib/Analysis/StaticDataProfileInfo.cpp b/llvm/lib/Analysis/StaticDataProfileInfo.cpp
index b036b2d..1f751ee 100644
--- a/llvm/lib/Analysis/StaticDataProfileInfo.cpp
+++ b/llvm/lib/Analysis/StaticDataProfileInfo.cpp
@@ -6,6 +6,46 @@
 #include "llvm/ProfileData/InstrProf.h"
 
 using namespace llvm;
+
+namespace llvm {
+namespace memprof {
+// Returns true iff the global variable has a custom section, either via
+// __attribute__((section("name")))
+// (https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate)
+// or #pragma clang section directives
+// (https://clang.llvm.org/docs/LanguageExtensions.html#specifying-section-names-for-global-objects-pragma-clang-section).
+static bool hasExplicitSectionName(const GlobalVariable &GVar) {
+  if (GVar.hasSection())
+    return true;
+
+  auto Attrs = GVar.getAttributes();
+  if (Attrs.hasAttribute("bss-section") || Attrs.hasAttribute("data-section") ||
+      Attrs.hasAttribute("relro-section") ||
+      Attrs.hasAttribute("rodata-section"))
+    return true;
+  return false;
+}
+
+AnnotationKind getAnnotationKind(const GlobalVariable &GV) {
+  if (GV.isDeclarationForLinker())
+    return AnnotationKind::DeclForLinker;
+  // Skip 'llvm.'-prefixed global variables conservatively because they are
+  // often handled specially.
+  StringRef Name = GV.getName();
+  if (Name.starts_with("llvm."))
+    return AnnotationKind::ReservedName;
+  // Respect user-specified custom data sections.
+  if (hasExplicitSectionName(GV))
+    return AnnotationKind::ExplicitSection;
+  return AnnotationKind::AnnotationOK;
+}
+
+bool IsAnnotationOK(const GlobalVariable &GV) {
+  return getAnnotationKind(GV) == AnnotationKind::AnnotationOK;
+}
+} // namespace memprof
+} // namespace llvm
+
 void StaticDataProfileInfo::addConstantProfileCount(
     const Constant *C, std::optional<uint64_t> Count) {
   if (!Count) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
index 6356d71..873ac8f 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
@@ -20,7 +20,7 @@
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
 
-namespace llvm {
+using namespace llvm;
 
 AIXException::AIXException(AsmPrinter *A) : EHStreamer(A) {}
 
@@ -90,5 +90,3 @@ void AIXException::endFunction(const MachineFunction *MF) {
 
   emitExceptionInfoTable(LSDALabel, PerSym);
 }
-
-} // End of namespace llvm
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 219bbc9..05fffe9 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1437,7 +1437,8 @@ getBBAddrMapFeature(const MachineFunction &MF, int NumMBBSectionRanges,
           BrProbEnabled,
           MF.hasBBSections() && NumMBBSectionRanges > 1,
           static_cast<bool>(BBAddrMapSkipEmitBBEntries),
-          HasCalls};
+          HasCalls,
+          false};
 }
 
 void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) {
diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
index 260ce8f..93ae548 100644
--- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
+++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -85,8 +85,7 @@ template <> struct llvm::DenseMapInfo<VariableID> {
 
 using VarLocInsertPt = PointerUnion<const Instruction *, const DbgRecord *>;
 
-namespace std {
-template <> struct hash<VarLocInsertPt> {
+template <> struct std::hash<VarLocInsertPt> {
   using argument_type = VarLocInsertPt;
   using result_type = std::size_t;
 
@@ -94,7 +93,6 @@ template <> struct hash<VarLocInsertPt> {
     return std::hash<void *>()(Arg.getOpaqueValue());
   }
 };
-} // namespace std
 
 /// Helper class to build FunctionVarLocs, since that class isn't easy to
 /// modify. TODO: There's not a great deal of value in the split, it could be
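The AssignmentTrackingAnalysis hunk uses the C++17 rule that an explicit specialization may be defined outside its enclosing namespace with a qualified name, so the namespace std wrapper can go. A minimal illustration with an invented key type:

#include <cstddef>
#include <functional>

struct Key { void *Ptr; };

// Qualified-name specialization: no `namespace std { ... }` block needed.
template <> struct std::hash<Key> {
  std::size_t operator()(const Key &K) const noexcept {
    return std::hash<void *>()(K.Ptr);
  }
};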
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 4931403..53f1cfe2 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -770,7 +770,7 @@ struct PartwordMaskValues {
   Value *Inv_Mask = nullptr;
 };
 
-LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]]
 raw_ostream &operator<<(raw_ostream &O, const PartwordMaskValues &PMV) {
   auto PrintObj = [&O](auto *V) {
     if (V)
diff --git a/llvm/lib/CodeGen/BasicBlockPathCloning.cpp b/llvm/lib/CodeGen/BasicBlockPathCloning.cpp
index fd7df6b..47b7a88 100644
--- a/llvm/lib/CodeGen/BasicBlockPathCloning.cpp
+++ b/llvm/lib/CodeGen/BasicBlockPathCloning.cpp
@@ -207,9 +207,7 @@ bool ApplyCloning(MachineFunction &MF,
   }
   return AnyPathsCloned;
 }
-} // end anonymous namespace
 
-namespace llvm {
 class BasicBlockPathCloning : public MachineFunctionPass {
 public:
   static char ID;
@@ -229,7 +227,7 @@ public:
   bool runOnMachineFunction(MachineFunction &MF) override;
 };
-} // namespace llvm
+} // namespace
 
 char BasicBlockPathCloning::ID = 0;
 INITIALIZE_PASS_BEGIN(
diff --git a/llvm/lib/CodeGen/BreakFalseDeps.cpp b/llvm/lib/CodeGen/BreakFalseDeps.cpp
index 28e6728..1846880 100644
--- a/llvm/lib/CodeGen/BreakFalseDeps.cpp
+++ b/llvm/lib/CodeGen/BreakFalseDeps.cpp
@@ -31,7 +31,7 @@
 
 using namespace llvm;
 
-namespace llvm {
+namespace {
 
 class BreakFalseDeps : public MachineFunctionPass {
 private:
@@ -95,7 +95,7 @@ private:
   void processUndefReads(MachineBasicBlock *);
 };
 
-} // namespace llvm
+} // namespace
 
 #define DEBUG_TYPE "break-false-deps"
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 6c2a5a7..87ada87 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -126,8 +126,7 @@ hash_code hash_value(const ComplexValue &Arg) {
 } // end namespace
 
 typedef SmallVector<struct ComplexValue, 2> ComplexValues;
 
-namespace llvm {
-template <> struct DenseMapInfo<ComplexValue> {
+template <> struct llvm::DenseMapInfo<ComplexValue> {
   static inline ComplexValue getEmptyKey() {
     return {DenseMapInfo<Value *>::getEmptyKey(),
             DenseMapInfo<Value *>::getEmptyKey()};
@@ -144,7 +143,6 @@ template <> struct DenseMapInfo<ComplexValue> {
     return LHS.Real == RHS.Real && LHS.Imag == RHS.Imag;
   }
 };
-} // end namespace llvm
 
 namespace {
 template <typename T, typename IterT>
diff --git a/llvm/lib/CodeGen/EdgeBundles.cpp b/llvm/lib/CodeGen/EdgeBundles.cpp
index f4335396..50dd66f 100644
--- a/llvm/lib/CodeGen/EdgeBundles.cpp
+++ b/llvm/lib/CodeGen/EdgeBundles.cpp
@@ -81,13 +81,10 @@ void EdgeBundles::init() {
   }
 }
 
-namespace llvm {
-
 /// Specialize WriteGraph, the standard implementation won't work.
-template<>
-raw_ostream &WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
-                          bool ShortNames,
-                          const Twine &Title) {
+template <>
+raw_ostream &llvm::WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
+                                bool ShortNames, const Twine &Title) {
   const MachineFunction *MF = G.getMachineFunction();
 
   O << "digraph {\n";
@@ -107,8 +104,6 @@ raw_ostream &WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
   return O;
 }
 
-} // end namespace llvm
-
 /// view - Visualize the annotated bipartite CFG with Graphviz.
 void EdgeBundles::view() const {
   ViewGraph(*this, "EdgeBundles");
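Several hunks in this commit (MemorySSA, AtomicExpandPass, GISelValueTracking, ScheduleDAGInstrs) replace LLVM_ATTRIBUTE_UNUSED with the standard C++17 attribute. A tiny standalone illustration of the idiom:

// [[maybe_unused]] portably suppresses unused-entity warnings; the entity
// may legitimately be referenced only in some configurations (e.g. only
// from assertion or LLVM_DEBUG code).
[[maybe_unused]] static int DebugCounter = 0;

[[maybe_unused]] static void dumpState(int V) { (void)V; }

int main() {
#ifndef NDEBUG
  dumpState(DebugCounter); // Only referenced in debug builds.
#endif
}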
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index 9cc6c6a..04c7008 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -82,7 +82,7 @@ public:
   }
 
   static FRemExpander create(IRBuilder<> &B, Type *Ty) {
-    assert(canExpandType(Ty));
+    assert(canExpandType(Ty) && "Expected supported floating point type");
 
     // The type to use for the computation of the remainder. This may be
     // wider than the input/result type which affects the ...
@@ -356,8 +356,9 @@ Value *FRemExpander::buildFRem(Value *X, Value *Y,
 static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) {
   LLVM_DEBUG(dbgs() << "Expanding instruction: " << I << '\n');
 
-  Type *ReturnTy = I.getType();
-  assert(FRemExpander::canExpandType(ReturnTy->getScalarType()));
+  Type *Ty = I.getType();
+  assert(FRemExpander::canExpandType(Ty) &&
+         "Expected supported floating point type");
 
   FastMathFlags FMF = I.getFastMathFlags();
   // TODO Make use of those flags for optimization?
@@ -368,32 +369,10 @@ static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) {
   B.setFastMathFlags(FMF);
   B.SetCurrentDebugLocation(I.getDebugLoc());
 
-  Type *ElemTy = ReturnTy->getScalarType();
-  const FRemExpander Expander = FRemExpander::create(B, ElemTy);
-
-  Value *Ret;
-  if (ReturnTy->isFloatingPointTy())
-    Ret = FMF.approxFunc()
-              ? Expander.buildApproxFRem(I.getOperand(0), I.getOperand(1))
-              : Expander.buildFRem(I.getOperand(0), I.getOperand(1), SQ);
-  else {
-    auto *VecTy = cast<FixedVectorType>(ReturnTy);
-
-    // This could use SplitBlockAndInsertForEachLane but the interface
-    // is a bit awkward for a constant number of elements and it will
-    // boil down to the same code.
-    // TODO Expand the FRem instruction only once and reuse the code.
-    Value *Nums = I.getOperand(0);
-    Value *Denums = I.getOperand(1);
-    Ret = PoisonValue::get(I.getType());
-    for (int I = 0, E = VecTy->getNumElements(); I != E; ++I) {
-      Value *Num = B.CreateExtractElement(Nums, I);
-      Value *Denum = B.CreateExtractElement(Denums, I);
-      Value *Rem = FMF.approxFunc() ? Expander.buildApproxFRem(Num, Denum)
-                                    : Expander.buildFRem(Num, Denum, SQ);
-      Ret = B.CreateInsertElement(Ret, Rem, I);
-    }
-  }
+  const FRemExpander Expander = FRemExpander::create(B, Ty);
+  Value *Ret = FMF.approxFunc()
+                   ? Expander.buildApproxFRem(I.getOperand(0), I.getOperand(1))
+                   : Expander.buildFRem(I.getOperand(0), I.getOperand(1), SQ);
 
   I.replaceAllUsesWith(Ret);
   Ret->takeName(&I);
@@ -939,7 +918,8 @@ static void expandIToFP(Instruction *IToFP) {
   IToFP->eraseFromParent();
 }
 
-static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) {
+static void scalarize(Instruction *I,
+                      SmallVectorImpl<Instruction *> &Worklist) {
   VectorType *VTy = cast<FixedVectorType>(I->getType());
   IRBuilder<> Builder(I);
 
@@ -948,12 +928,25 @@ static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) {
   Value *Result = PoisonValue::get(VTy);
   for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
     Value *Ext = Builder.CreateExtractElement(I->getOperand(0), Idx);
-    Value *Cast = Builder.CreateCast(cast<CastInst>(I)->getOpcode(), Ext,
-                                     I->getType()->getScalarType());
-    Result = Builder.CreateInsertElement(Result, Cast, Idx);
-    if (isa<Instruction>(Cast))
-      Replace.push_back(cast<Instruction>(Cast));
+
+    Value *NewOp = nullptr;
+    if (auto *BinOp = dyn_cast<BinaryOperator>(I))
+      NewOp = Builder.CreateBinOp(
+          BinOp->getOpcode(), Ext,
+          Builder.CreateExtractElement(I->getOperand(1), Idx));
+    else if (auto *CastI = dyn_cast<CastInst>(I))
+      NewOp = Builder.CreateCast(CastI->getOpcode(), Ext,
+                                 I->getType()->getScalarType());
+    else
+      llvm_unreachable("Unsupported instruction type");
+
+    Result = Builder.CreateInsertElement(Result, NewOp, Idx);
+    if (auto *ScalarizedI = dyn_cast<Instruction>(NewOp)) {
+      ScalarizedI->copyIRFlags(I, true);
+      Worklist.push_back(ScalarizedI);
+    }
   }
+
   I->replaceAllUsesWith(Result);
   I->dropAllReferences();
   I->eraseFromParent();
@@ -989,10 +982,17 @@ static bool targetSupportsFrem(const TargetLowering &TLI, Type *Ty) {
   return TLI.getLibcallName(fremToLibcall(Ty->getScalarType()));
 }
 
+static void addToWorklist(Instruction &I,
+                          SmallVector<Instruction *, 4> &Worklist) {
+  if (I.getOperand(0)->getType()->isVectorTy())
+    scalarize(&I, Worklist);
+  else
+    Worklist.push_back(&I);
+}
+
 static bool runImpl(Function &F, const TargetLowering &TLI,
                     AssumptionCache *AC) {
-  SmallVector<Instruction *, 4> Replace;
-  SmallVector<Instruction *, 4> ReplaceVector;
+  SmallVector<Instruction *, 4> Worklist;
   bool Modified = false;
 
   unsigned MaxLegalFpConvertBitWidth =
@@ -1003,55 +1003,39 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
   if (MaxLegalFpConvertBitWidth >= llvm::IntegerType::MAX_INT_BITS)
     return false;
 
-  for (auto &I : instructions(F)) {
-    switch (I.getOpcode()) {
-    case Instruction::FRem: {
-      Type *Ty = I.getType();
-      // TODO: This pass doesn't handle scalable vectors.
-      if (Ty->isScalableTy())
-        continue;
-
-      if (targetSupportsFrem(TLI, Ty) ||
-          !FRemExpander::canExpandType(Ty->getScalarType()))
-        continue;
-
-      Replace.push_back(&I);
-      Modified = true;
+  for (auto It = inst_begin(&F), End = inst_end(F); It != End;) {
+    Instruction &I = *It++;
+    Type *Ty = I.getType();
+    // TODO: This pass doesn't handle scalable vectors.
+    if (Ty->isScalableTy())
+      continue;
+    switch (I.getOpcode()) {
+    case Instruction::FRem:
+      if (!targetSupportsFrem(TLI, Ty) &&
+          FRemExpander::canExpandType(Ty->getScalarType())) {
+        addToWorklist(I, Worklist);
+        Modified = true;
+      }
       break;
-    }
     case Instruction::FPToUI:
     case Instruction::FPToSI: {
-      // TODO: This pass doesn't handle scalable vectors.
-      if (I.getOperand(0)->getType()->isScalableTy())
-        continue;
-
-      auto *IntTy = cast<IntegerType>(I.getType()->getScalarType());
+      auto *IntTy = cast<IntegerType>(Ty->getScalarType());
       if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth)
         continue;
 
-      if (I.getOperand(0)->getType()->isVectorTy())
-        ReplaceVector.push_back(&I);
-      else
-        Replace.push_back(&I);
+      addToWorklist(I, Worklist);
       Modified = true;
      break;
    }
     case Instruction::UIToFP:
     case Instruction::SIToFP: {
-      // TODO: This pass doesn't handle scalable vectors.
-      if (I.getOperand(0)->getType()->isScalableTy())
-        continue;
-
       auto *IntTy =
           cast<IntegerType>(I.getOperand(0)->getType()->getScalarType());
       if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth)
         continue;
 
-      if (I.getOperand(0)->getType()->isVectorTy())
-        ReplaceVector.push_back(&I);
-      else
-        Replace.push_back(&I);
+      addToWorklist(I, Worklist);
       Modified = true;
       break;
     }
@@ -1060,16 +1044,8 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
     }
   }
 
-  while (!ReplaceVector.empty()) {
-    Instruction *I = ReplaceVector.pop_back_val();
-    scalarize(I, Replace);
-  }
-
-  if (Replace.empty())
-    return false;
-
-  while (!Replace.empty()) {
-    Instruction *I = Replace.pop_back_val();
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
     if (I->getOpcode() == Instruction::FRem) {
       auto SQ = [&]() -> std::optional<SimplifyQuery> {
         if (AC) {
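The reworked scalarize() above follows the usual extract/apply/insert recipe for lowering a vector operation lane by lane. A simplified sketch for the binary-operator case, assuming LLVM's IRBuilder (no flag propagation, fixed-width vectors only; the helper name is invented):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical helper: scalarize a vector BinaryOperator lane by lane.
static Value *scalarizeBinOp(BinaryOperator *BO) {
  auto *VTy = cast<FixedVectorType>(BO->getType());
  IRBuilder<> B(BO);
  Value *Result = PoisonValue::get(VTy);
  for (unsigned Lane = 0; Lane < VTy->getNumElements(); ++Lane) {
    Value *L = B.CreateExtractElement(BO->getOperand(0), Lane);
    Value *R = B.CreateExtractElement(BO->getOperand(1), Lane);
    Result = B.CreateInsertElement(
        Result, B.CreateBinOp(BO->getOpcode(), L, R), Lane);
  }
  return Result; // Caller RAUWs BO with Result and erases BO.
}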
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
index 90c60d4..04d9309 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
@@ -112,7 +112,7 @@ APInt GISelValueTracking::getKnownOnes(Register R) {
   return getKnownBits(R).One;
 }
 
-LLVM_ATTRIBUTE_UNUSED static void
+[[maybe_unused]] static void
 dumpResult(const MachineInstr &MI, const KnownBits &Known, unsigned Depth) {
   dbgs() << "[" << Depth << "] Compute known bits: " << MI << "[" << Depth
          << "] Computed for: " << MI << "[" << Depth << "] Known: 0x"
@@ -1975,6 +1975,81 @@ unsigned GISelValueTracking::computeNumSignBits(Register R,
     break;
   }
+  case TargetOpcode::G_SUB: {
+    Register Src2 = MI.getOperand(2).getReg();
+    unsigned Src2NumSignBits =
+        computeNumSignBits(Src2, DemandedElts, Depth + 1);
+    if (Src2NumSignBits == 1)
+      return 1; // Early out.
+
+    // Handle NEG.
+    Register Src1 = MI.getOperand(1).getReg();
+    KnownBits Known1 = getKnownBits(Src1, DemandedElts, Depth);
+    if (Known1.isZero()) {
+      KnownBits Known2 = getKnownBits(Src2, DemandedElts, Depth);
+      // If the input is known to be 0 or 1, the output is 0/-1, which is all
+      // sign bits set.
+      if ((Known2.Zero | 1).isAllOnes())
+        return TyBits;
+
+      // If the input is known to be positive (the sign bit is known clear),
+      // the output of the NEG has, at worst, the same number of sign bits as
+      // the input.
+      if (Known2.isNonNegative()) {
+        FirstAnswer = Src2NumSignBits;
+        break;
+      }
+
+      // Otherwise, we treat this like a SUB.
+    }
+
+    unsigned Src1NumSignBits =
+        computeNumSignBits(Src1, DemandedElts, Depth + 1);
+    if (Src1NumSignBits == 1)
+      return 1; // Early Out.
+
+    // Sub can have at most one carry bit. Thus we know that the output
+    // is, at worst, one more bit than the inputs.
+    FirstAnswer = std::min(Src1NumSignBits, Src2NumSignBits) - 1;
+    break;
+  }
+  case TargetOpcode::G_ADD: {
+    Register Src2 = MI.getOperand(2).getReg();
+    unsigned Src2NumSignBits =
+        computeNumSignBits(Src2, DemandedElts, Depth + 1);
+    if (Src2NumSignBits <= 2)
+      return 1; // Early out.
+
+    Register Src1 = MI.getOperand(1).getReg();
+    unsigned Src1NumSignBits =
+        computeNumSignBits(Src1, DemandedElts, Depth + 1);
+    if (Src1NumSignBits == 1)
+      return 1; // Early Out.
+
+    // Special case decrementing a value (ADD X, -1):
+    KnownBits Known2 = getKnownBits(Src2, DemandedElts, Depth);
+    if (Known2.isAllOnes()) {
+      KnownBits Known1 = getKnownBits(Src1, DemandedElts, Depth);
+      // If the input is known to be 0 or 1, the output is 0/-1, which is all
+      // sign bits set.
+      if ((Known1.Zero | 1).isAllOnes())
+        return TyBits;
+
+      // If we are subtracting one from a positive number, there is no carry
+      // out of the result.
+      if (Known1.isNonNegative()) {
+        FirstAnswer = Src1NumSignBits;
+        break;
+      }
+
+      // Otherwise, we treat this like an ADD.
+    }
+
+    // Add can have at most one carry bit. Thus we know that the output
+    // is, at worst, one more bit than the inputs.
+    FirstAnswer = std::min(Src1NumSignBits, Src2NumSignBits) - 1;
+    break;
+  }
   case TargetOpcode::G_FCMP:
   case TargetOpcode::G_ICMP: {
     bool IsFP = Opcode == TargetOpcode::G_FCMP;
diff --git a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
index 47640c4a..81ab317 100644
--- a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
+++ b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
@@ -587,16 +587,12 @@ public:
 } // namespace
 
 char GlobalMergeFuncPassWrapper::ID = 0;
-INITIALIZE_PASS_BEGIN(GlobalMergeFuncPassWrapper, "global-merge-func",
-                      "Global merge function pass", false, false)
-INITIALIZE_PASS_END(GlobalMergeFuncPassWrapper, "global-merge-func",
-                    "Global merge function pass", false, false)
+INITIALIZE_PASS(GlobalMergeFuncPassWrapper, "global-merge-func",
+                "Global merge function pass", false, false)
 
-namespace llvm {
-ModulePass *createGlobalMergeFuncPass() {
+ModulePass *llvm::createGlobalMergeFuncPass() {
   return new GlobalMergeFuncPassWrapper();
 }
-} // namespace llvm
 
 GlobalMergeFuncPassWrapper::GlobalMergeFuncPassWrapper() : ModulePass(ID) {
   initializeGlobalMergeFuncPassWrapperPass(
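The new G_SUB/G_ADD cases rest on the bound that an addition or subtraction can destroy at most one known sign bit (one carry or borrow), so the result keeps at least min(sign bits of the operands) - 1, never dropping below 1. A brute-force verification over all 8-bit pairs, in plain C++:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Number of redundant sign bits of an 8-bit value (always >= 1).
static int numSignBits(int8_t V) {
  int N = 1;
  while (N < 8 && ((V >> (7 - N)) & 1) == ((V >> 7) & 1))
    ++N;
  return N;
}

int main() {
  for (int A = -128; A < 128; ++A)
    for (int B = -128; B < 128; ++B) {
      int Bound = std::max(
          std::min(numSignBits(int8_t(A)), numSignBits(int8_t(B))) - 1, 1);
      assert(numSignBits(int8_t(A + B)) >= Bound);
      assert(numSignBits(int8_t(A - B)) >= Bound);
    }
}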
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index 3485a27..d2f2c3e 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -101,15 +101,11 @@ static cl::opt<bool> EnablePrecomputePhysRegs(
 static bool EnablePrecomputePhysRegs = false;
 #endif // NDEBUG
 
-namespace llvm {
-
-cl::opt<bool> UseSegmentSetForPhysRegs(
+cl::opt<bool> llvm::UseSegmentSetForPhysRegs(
     "use-segment-set-for-physregs", cl::Hidden, cl::init(true),
     cl::desc(
         "Use segment set for the computation of the live ranges of physregs."));
 
-} // end namespace llvm
-
 void LiveIntervalsWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
   AU.addPreserved<LiveVariablesWrapperPass>();
@@ -665,7 +661,10 @@ void LiveIntervals::extendToIndices(LiveRange &LR,
 void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill,
                                SmallVectorImpl<SlotIndex> *EndPoints) {
   LiveQueryResult LRQ = LR.Query(Kill);
-  VNInfo *VNI = LRQ.valueOutOrDead();
+  // LR may have liveness reachable from an early-clobber slot, which may be
+  // only live-in instead of live-out of the instruction.
+  // For example, for LR = [1r, 3r) and Kill = 3e, we have to prune [3e, 3r).
+  VNInfo *VNI = LRQ.valueOutOrDead() ? LRQ.valueOutOrDead() : LRQ.valueIn();
   if (!VNI)
     return;
diff --git a/llvm/lib/CodeGen/MIR2Vec.cpp b/llvm/lib/CodeGen/MIR2Vec.cpp
index e859765..5c78d98 100644
--- a/llvm/lib/CodeGen/MIR2Vec.cpp
+++ b/llvm/lib/CodeGen/MIR2Vec.cpp
@@ -29,20 +29,17 @@ using namespace mir2vec;
 STATISTIC(MIRVocabMissCounter,
           "Number of lookups to MIR entities not present in the vocabulary");
 
-namespace llvm {
-namespace mir2vec {
-cl::OptionCategory MIR2VecCategory("MIR2Vec Options");
+cl::OptionCategory llvm::mir2vec::MIR2VecCategory("MIR2Vec Options");
 
 // FIXME: Use a default vocab when not specified
 static cl::opt<std::string>
     VocabFile("mir2vec-vocab-path", cl::Optional,
              cl::desc("Path to the vocabulary file for MIR2Vec"), cl::init(""),
              cl::cat(MIR2VecCategory));
-cl::opt<float> OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0),
-                         cl::desc("Weight for machine opcode embeddings"),
-                         cl::cat(MIR2VecCategory));
-} // namespace mir2vec
-} // namespace llvm
+cl::opt<float>
+    llvm::mir2vec::OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0),
+                             cl::desc("Weight for machine opcode embeddings"),
+                             cl::cat(MIR2VecCategory));
 
 //===----------------------------------------------------------------------===//
 // Vocabulary Implementation
diff --git a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
index f5146f5..d988a2a 100644
--- a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
+++ b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
@@ -40,7 +40,7 @@ cl::opt<bool> ImprovedFSDiscriminator(
     "improved-fs-discriminator", cl::Hidden, cl::init(false),
     cl::desc("New FS discriminators encoding (incompatible with the original "
              "encoding)"));
-}
+} // namespace llvm
 
 char MIRAddFSDiscriminators::ID = 0;
diff --git a/llvm/lib/CodeGen/MIRNamerPass.cpp b/llvm/lib/CodeGen/MIRNamerPass.cpp
index bc65700..cbf8867 100644
--- a/llvm/lib/CodeGen/MIRNamerPass.cpp
+++ b/llvm/lib/CodeGen/MIRNamerPass.cpp
@@ -23,10 +23,6 @@
 
 using namespace llvm;
 
-namespace llvm {
-extern char &MIRNamerID;
-} // namespace llvm
-
 #define DEBUG_TYPE "mir-namer"
 
 namespace {
@@ -53,10 +49,9 @@ public:
 
     VRegRenamer Renamer(MF.getRegInfo());
 
-    unsigned BBIndex = 0;
     ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
-    for (auto &MBB : RPOT)
-      Changed |= Renamer.renameVRegs(MBB, BBIndex++);
+    for (const auto &[BBIndex, MBB] : enumerate(RPOT))
+      Changed |= Renamer.renameVRegs(MBB, BBIndex);
 
     return Changed;
   }
@@ -66,10 +61,4 @@ public:
 
 char MIRNamer::ID;
 
-char &llvm::MIRNamerID = MIRNamer::ID;
-
-INITIALIZE_PASS_BEGIN(MIRNamer, "mir-namer", "Rename Register Operands", false,
-                      false)
-
-INITIALIZE_PASS_END(MIRNamer, "mir-namer", "Rename Register Operands", false,
-                    false)
+INITIALIZE_PASS(MIRNamer, "mir-namer", "Rename Register Operands", false, false)
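The MIRNamer hunk swaps a manually maintained counter for llvm::enumerate. A standalone sketch of the idiom, assuming LLVM's STLExtras.h:

#include "llvm/ADT/STLExtras.h"
#include <vector>

// Iterate with an index without a separately incremented counter.
void renameAll(std::vector<int> &Blocks) {
  for (const auto &[Index, Block] : llvm::enumerate(Blocks)) {
    // Index is the 0-based position; Block references the element.
    (void)Index;
    (void)Block;
  }
}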
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index bf8a6cd..1d54d72 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -107,10 +107,8 @@ struct MFPrintState {
 
 } // end anonymous namespace
 
-namespace llvm::yaml {
-
 /// This struct serializes the LLVM IR module.
-template <> struct BlockScalarTraits<Module> {
+template <> struct yaml::BlockScalarTraits<Module> {
   static void output(const Module &Mod, void *Ctxt, raw_ostream &OS) {
     Mod.print(OS, nullptr);
   }
@@ -121,8 +119,6 @@ template <> struct BlockScalarTraits<Module> {
   }
 };
 
-} // end namespace llvm::yaml
-
 static void printRegMIR(Register Reg, yaml::StringValue &Dest,
                         const TargetRegisterInfo *TRI) {
   raw_string_ostream OS(Dest.Value);
@@ -866,48 +862,46 @@ static void printMI(raw_ostream &OS, MFPrintState &State,
 
   OS << TII->getName(MI.getOpcode());
 
-  LS = ListSeparator();
+  // Print a space after the opcode if any additional tokens are printed.
+  LS = ListSeparator(", ", " ");
 
-  if (I < E) {
-    OS << ' ';
-    for (; I < E; ++I) {
-      OS << LS;
-      printMIOperand(OS, State, MI, I, TRI, TII, ShouldPrintRegisterTies,
-                     PrintedTypes, MRI, /*PrintDef=*/true);
-    }
+  for (; I < E; ++I) {
+    OS << LS;
+    printMIOperand(OS, State, MI, I, TRI, TII, ShouldPrintRegisterTies,
+                   PrintedTypes, MRI, /*PrintDef=*/true);
   }
 
   // Print any optional symbols attached to this instruction as-if they were
   // operands.
   if (MCSymbol *PreInstrSymbol = MI.getPreInstrSymbol()) {
-    OS << LS << " pre-instr-symbol ";
+    OS << LS << "pre-instr-symbol ";
     MachineOperand::printSymbol(OS, *PreInstrSymbol);
   }
   if (MCSymbol *PostInstrSymbol = MI.getPostInstrSymbol()) {
-    OS << LS << " post-instr-symbol ";
+    OS << LS << "post-instr-symbol ";
     MachineOperand::printSymbol(OS, *PostInstrSymbol);
   }
   if (MDNode *HeapAllocMarker = MI.getHeapAllocMarker()) {
-    OS << LS << " heap-alloc-marker ";
+    OS << LS << "heap-alloc-marker ";
     HeapAllocMarker->printAsOperand(OS, State.MST);
   }
   if (MDNode *PCSections = MI.getPCSections()) {
-    OS << LS << " pcsections ";
+    OS << LS << "pcsections ";
     PCSections->printAsOperand(OS, State.MST);
   }
   if (MDNode *MMRA = MI.getMMRAMetadata()) {
-    OS << LS << " mmra ";
+    OS << LS << "mmra ";
     MMRA->printAsOperand(OS, State.MST);
   }
   if (uint32_t CFIType = MI.getCFIType())
-    OS << LS << " cfi-type " << CFIType;
+    OS << LS << "cfi-type " << CFIType;
 
   if (auto Num = MI.peekDebugInstrNum())
-    OS << LS << " debug-instr-number " << Num;
+    OS << LS << "debug-instr-number " << Num;
 
   if (PrintLocations) {
     if (const DebugLoc &DL = MI.getDebugLoc()) {
-      OS << LS << " debug-location ";
+      OS << LS << "debug-location ";
       DL->printAsOperand(OS, State.MST);
     }
   }
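The printMI change depends on a separator object that emits a prefix (here a single space) before the first item and the separator before each later one, which is what lets the if (I < E) special case go away. A hand-rolled standalone equivalent of that behavior (LLVM's own ListSeparator lives in ADT/StringExtras.h; the two-argument form used above is assumed from context):

#include <iostream>
#include <string>

// First use prints the prefix; every later use prints the separator.
class Separator {
  std::string Sep, Prefix;
  bool First = true;

public:
  Separator(std::string S, std::string P)
      : Sep(std::move(S)), Prefix(std::move(P)) {}
  friend std::ostream &operator<<(std::ostream &OS, Separator &LS) {
    OS << (LS.First ? LS.Prefix : LS.Sep);
    LS.First = false;
    return OS;
  }
};

int main() {
  Separator LS(", ", " ");
  std::cout << "OPCODE";
  for (const char *Op : {"a", "b", "c"})
    std::cout << LS << Op; // Prints "OPCODE a, b, c".
}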
diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp
index b2731b69..a72c2c4 100644
--- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp
+++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp
@@ -97,7 +97,9 @@ static const bool EnableDevelopmentFeatures = false;
 /// this happens only in development mode. It's a no-op otherwise.
 namespace llvm {
 extern cl::opt<unsigned> EvictInterferenceCutoff;
+} // namespace llvm
 
+namespace {
 class RegAllocScoring : public MachineFunctionPass {
 public:
   static char ID;
@@ -124,11 +126,12 @@ public:
   /// Performs this pass
   bool runOnMachineFunction(MachineFunction &) override;
 };
+} // namespace
 
 char RegAllocScoring::ID = 0;
-FunctionPass *createRegAllocScoringPass() { return new RegAllocScoring(); }
-
-} // namespace llvm
+FunctionPass *llvm::createRegAllocScoringPass() {
+  return new RegAllocScoring();
+}
 
 INITIALIZE_PASS(RegAllocScoring, "regallocscoringpass",
                 "Register Allocation Scoring Pass", false, false)
diff --git a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index e7fa082..26eb10f 100644
--- a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -29,7 +29,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "machine-block-freq"
 
-namespace llvm {
 static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
     "view-machine-block-freq-propagation-dags", cl::Hidden,
     cl::desc("Pop up a window to show a dag displaying how machine block "
@@ -44,6 +43,7 @@ static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
                clEnumValN(GVDT_Count, "count",
                           "display a graph using the real "
                           "profile count if available.")));
 
+namespace llvm {
 // Similar option above, but used to control BFI display only after MBP pass
 cl::opt<GVDAGType> ViewBlockLayoutWithBFI(
     "view-block-layout-with-bfi", cl::Hidden,
@@ -69,15 +69,15 @@ extern cl::opt<std::string> ViewBlockFreqFuncName;
 // Defined in Analysis/BlockFrequencyInfo.cpp: -view-hot-freq-perc=
 extern cl::opt<unsigned> ViewHotFreqPercent;
 
-static cl::opt<bool> PrintMachineBlockFreq(
-    "print-machine-bfi", cl::init(false), cl::Hidden,
-    cl::desc("Print the machine block frequency info."));
-
 // Command line option to specify the name of the function for block frequency
 // dump. Defined in Analysis/BlockFrequencyInfo.cpp.
 extern cl::opt<std::string> PrintBFIFuncName;
 } // namespace llvm
 
+static cl::opt<bool>
+    PrintMachineBlockFreq("print-machine-bfi", cl::init(false), cl::Hidden,
+                          cl::desc("Print the machine block frequency info."));
+
 static GVDAGType getGVDT() {
   if (ViewBlockLayoutWithBFI != GVDT_None)
     return ViewBlockLayoutWithBFI;
@@ -85,9 +85,7 @@ static GVDAGType getGVDT() {
   return ViewMachineBlockFreqPropagationDAG;
 }
 
-namespace llvm {
-
-template <> struct GraphTraits<MachineBlockFrequencyInfo *> {
+template <> struct llvm::GraphTraits<MachineBlockFrequencyInfo *> {
   using NodeRef = const MachineBasicBlock *;
   using ChildIteratorType = MachineBasicBlock::const_succ_iterator;
   using nodes_iterator = pointer_iterator<MachineFunction::const_iterator>;
@@ -116,7 +114,7 @@ using MBFIDOTGraphTraitsBase =
                               MachineBranchProbabilityInfo>;
 
 template <>
-struct DOTGraphTraits<MachineBlockFrequencyInfo *>
+struct llvm::DOTGraphTraits<MachineBlockFrequencyInfo *>
     : public MBFIDOTGraphTraitsBase {
   const MachineFunction *CurFunc = nullptr;
   DenseMap<const MachineBasicBlock *, int> LayoutOrderMap;
@@ -159,8 +157,6 @@ struct DOTGraphTraits<MachineBlockFrequencyInfo *>
   }
 };
 
-} // end namespace llvm
-
 AnalysisKey MachineBlockFrequencyAnalysis::Key;
 
 MachineBlockFrequencyAnalysis::Result
diff --git a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
index 2e92dd8..7ca4582 100644
--- a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
@@ -18,13 +18,8 @@
 
 using namespace llvm;
 
-INITIALIZE_PASS_BEGIN(MachineBranchProbabilityInfoWrapperPass,
-                      "machine-branch-prob",
-                      "Machine Branch Probability Analysis", false, true)
-INITIALIZE_PASS_END(MachineBranchProbabilityInfoWrapperPass,
-                    "machine-branch-prob",
-                    "Machine Branch Probability Analysis", false, true)
-
+INITIALIZE_PASS(MachineBranchProbabilityInfoWrapperPass, "machine-branch-prob",
+                "Machine Branch Probability Analysis", false, true)
 
 namespace llvm {
 cl::opt<unsigned>
     StaticLikelyProb("static-likely-prob",
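The GraphTraits/DOTGraphTraits hunks use the same qualified-name specialization trick as the std::hash change earlier: a template that lives in another namespace can be specialized without reopening that namespace. A generic standalone illustration (names invented):

namespace gfx {
template <typename G> struct GraphTraits; // primary template elsewhere
}

struct MyGraph {};

// Qualified-name specialization, no `namespace gfx { ... }` block needed.
template <> struct gfx::GraphTraits<MyGraph> {
  using NodeRef = const MyGraph *;
};

int main() {}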
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 224231c..bfa5ab2 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -719,43 +719,41 @@ MachineFunction::CallSiteInfo::CallSiteInfo(const CallBase &CB) {
   }
 }
 
-namespace llvm {
+template <>
+struct llvm::DOTGraphTraits<const MachineFunction *>
+    : public DefaultDOTGraphTraits {
+  DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
 
-  template<>
-  struct DOTGraphTraits<const MachineFunction*> : public DefaultDOTGraphTraits {
-    DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+  static std::string getGraphName(const MachineFunction *F) {
+    return ("CFG for '" + F->getName() + "' function").str();
+  }
 
-    static std::string getGraphName(const MachineFunction *F) {
-      return ("CFG for '" + F->getName() + "' function").str();
+  std::string getNodeLabel(const MachineBasicBlock *Node,
+                           const MachineFunction *Graph) {
+    std::string OutStr;
+    {
+      raw_string_ostream OSS(OutStr);
+
+      if (isSimple()) {
+        OSS << printMBBReference(*Node);
+        if (const BasicBlock *BB = Node->getBasicBlock())
+          OSS << ": " << BB->getName();
+      } else
+        Node->print(OSS);
     }
 
-    std::string getNodeLabel(const MachineBasicBlock *Node,
-                             const MachineFunction *Graph) {
-      std::string OutStr;
-      {
-        raw_string_ostream OSS(OutStr);
-
-        if (isSimple()) {
-          OSS << printMBBReference(*Node);
-          if (const BasicBlock *BB = Node->getBasicBlock())
-            OSS << ": " << BB->getName();
-        } else
-          Node->print(OSS);
-      }
-
-      if (OutStr[0] == '\n') OutStr.erase(OutStr.begin());
-
-      // Process string output to make it nicer...
-      for (unsigned i = 0; i != OutStr.length(); ++i)
-        if (OutStr[i] == '\n') { // Left justify
-          OutStr[i] = '\\';
-          OutStr.insert(OutStr.begin()+i+1, 'l');
-        }
-      return OutStr;
-    }
-  };
+    if (OutStr[0] == '\n')
+      OutStr.erase(OutStr.begin());
 
-} // end namespace llvm
+    // Process string output to make it nicer...
+    for (unsigned i = 0; i != OutStr.length(); ++i)
+      if (OutStr[i] == '\n') { // Left justify
+        OutStr[i] = '\\';
+        OutStr.insert(OutStr.begin() + i + 1, 'l');
+      }
+    return OutStr;
+  }
+};
 
 void MachineFunction::viewCFG() const {
diff --git a/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp b/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
index 0f88a7b..5111322 100644
--- a/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
@@ -60,13 +60,11 @@ char &llvm::MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;
 INITIALIZE_PASS(MachineFunctionPrinterPass, "machineinstr-printer",
                 "Machine Function Printer", false, false)
 
-namespace llvm {
 /// Returns a newly-created MachineFunction Printer pass. The
 /// default banner is empty.
 ///
-MachineFunctionPass *createMachineFunctionPrinterPass(raw_ostream &OS,
-                                                      const std::string &Banner){
+MachineFunctionPass *
+llvm::createMachineFunctionPrinterPass(raw_ostream &OS,
+                                       const std::string &Banner) {
   return new MachineFunctionPrinterPass(OS, Banner);
 }
-
-}
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index fdae3b4..9feb974 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -593,15 +593,12 @@ struct MachineOutliner : public ModulePass {
 
 char MachineOutliner::ID = 0;
 
-namespace llvm {
-ModulePass *createMachineOutlinerPass(RunOutliner RunOutlinerMode) {
+ModulePass *llvm::createMachineOutlinerPass(RunOutliner RunOutlinerMode) {
   MachineOutliner *OL = new MachineOutliner();
   OL->RunOutlinerMode = RunOutlinerMode;
   return OL;
 }
 
-} // namespace llvm
-
 INITIALIZE_PASS(MachineOutliner, DEBUG_TYPE, "Machine Function Outliner",
                 false, false)
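A recurring cleanup in this commit (MachineOutliner, GlobalMergeFunctions, LiveIntervals, and others) defines entities declared inside namespace llvm, usually in a header, with qualified names instead of wrapping the .cpp definitions in a namespace block. A standalone sketch of the pattern (names invented):

namespace proj {
class Pass {};
Pass *createMyPass(); // declaration, as it would appear in a header
extern int Budget;
}

// Definitions at file scope, qualified; no namespace reopening.
proj::Pass *proj::createMyPass() { return new Pass(); }
int proj::Budget = 42;

int main() { delete proj::createMyPass(); }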
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 89ed4da..a717d9e 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -201,16 +201,15 @@ static cl::opt<unsigned> SwpMaxNumStores(
     cl::desc("Maximum number of stores allowed in the target loop."),
     cl::Hidden, cl::init(200));
 
-namespace llvm {
-
 // A command line option to enable the CopyToPhi DAG mutation.
-cl::opt<bool> SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
-                                 cl::init(true),
-                                 cl::desc("Enable CopyToPhi DAG Mutation"));
+cl::opt<bool>
+    llvm::SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
+                             cl::init(true),
+                             cl::desc("Enable CopyToPhi DAG Mutation"));
 
 /// A command line argument to force pipeliner to use specified issue
 /// width.
-cl::opt<int> SwpForceIssueWidth(
+cl::opt<int> llvm::SwpForceIssueWidth(
     "pipeliner-force-issue-width",
     cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
     cl::init(-1));
@@ -226,8 +225,6 @@ static cl::opt<WindowSchedulingFlag> WindowSchedulingOption(
                clEnumValN(WindowSchedulingFlag::WS_Force, "force",
                           "Use window algorithm instead of SMS algorithm.")));
 
-} // end namespace llvm
-
 unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
 char MachinePipeliner::ID = 0;
 #ifndef NDEBUG
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 299bcc4..3ed1045 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -176,9 +176,7 @@ STATISTIC(NumNodeOrderPostRA,
 STATISTIC(NumFirstValidPostRA,
           "Number of scheduling units chosen for FirstValid heuristic post-RA");
 
-namespace llvm {
-
-cl::opt<MISched::Direction> PreRADirection(
+cl::opt<MISched::Direction> llvm::PreRADirection(
     "misched-prera-direction", cl::Hidden,
     cl::desc("Pre reg-alloc list scheduling direction"),
     cl::init(MISched::Unspecified),
@@ -206,33 +204,31 @@ static cl::opt<bool> DumpCriticalPathLength("misched-dcpl", cl::Hidden,
     cl::desc("Print critical path length to stdout"));
 
-cl::opt<bool> VerifyScheduling(
+cl::opt<bool> llvm::VerifyScheduling(
     "verify-misched", cl::Hidden,
     cl::desc("Verify machine instrs before and after machine scheduling"));
 
 #ifndef NDEBUG
-cl::opt<bool> ViewMISchedDAGs(
+cl::opt<bool> llvm::ViewMISchedDAGs(
     "view-misched-dags", cl::Hidden,
     cl::desc("Pop up a window to show MISched dags after they are processed"));
-cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden,
-                        cl::desc("Print schedule DAGs"));
-cl::opt<bool> MISchedDumpReservedCycles(
+cl::opt<bool> llvm::PrintDAGs("misched-print-dags", cl::Hidden,
+                              cl::desc("Print schedule DAGs"));
+static cl::opt<bool> MISchedDumpReservedCycles(
     "misched-dump-reserved-cycles", cl::Hidden, cl::init(false),
     cl::desc("Dump resource usage at schedule boundary."));
-cl::opt<bool> MischedDetailResourceBooking(
+static cl::opt<bool> MischedDetailResourceBooking(
     "misched-detail-resource-booking", cl::Hidden, cl::init(false),
     cl::desc("Show details of invoking getNextResourceCycle."));
 #else
-const bool ViewMISchedDAGs = false;
-const bool PrintDAGs = false;
-const bool MischedDetailResourceBooking = false;
+const bool llvm::ViewMISchedDAGs = false;
+const bool llvm::PrintDAGs = false;
+static const bool MischedDetailResourceBooking = false;
 #ifdef LLVM_ENABLE_DUMP
-const bool MISchedDumpReservedCycles = false;
+static const bool MISchedDumpReservedCycles = false;
 #endif // LLVM_ENABLE_DUMP
 #endif // NDEBUG
 
-} // end namespace llvm
-
 #ifndef NDEBUG
 /// In some situations a few uninteresting nodes depend on nearly all other
 /// nodes in the graph, provide a cutoff to hide them.
@@ -2053,28 +2049,24 @@ public:
 
 } // end anonymous namespace
 
-namespace llvm {
-
 std::unique_ptr<ScheduleDAGMutation>
-createLoadClusterDAGMutation(const TargetInstrInfo *TII,
-                             const TargetRegisterInfo *TRI,
-                             bool ReorderWhileClustering) {
+llvm::createLoadClusterDAGMutation(const TargetInstrInfo *TII,
+                                   const TargetRegisterInfo *TRI,
+                                   bool ReorderWhileClustering) {
   return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(
                                   TII, TRI, ReorderWhileClustering)
                             : nullptr;
 }
 
 std::unique_ptr<ScheduleDAGMutation>
-createStoreClusterDAGMutation(const TargetInstrInfo *TII,
-                              const TargetRegisterInfo *TRI,
-                              bool ReorderWhileClustering) {
+llvm::createStoreClusterDAGMutation(const TargetInstrInfo *TII,
+                                    const TargetRegisterInfo *TRI,
+                                    bool ReorderWhileClustering) {
   return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(
                                   TII, TRI, ReorderWhileClustering)
                             : nullptr;
 }
 
-} // end namespace llvm
-
 // Sorting all the loads/stores first, then for each load/store, checking the
 // following load/store one by one, until reach the first non-dependent one and
 // call target hook to see if they can cluster.
@@ -2304,16 +2296,12 @@ protected:
 
 } // end anonymous namespace
 
-namespace llvm {
-
 std::unique_ptr<ScheduleDAGMutation>
-createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
-                               const TargetRegisterInfo *TRI) {
+llvm::createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
+                                     const TargetRegisterInfo *TRI) {
   return std::make_unique<CopyConstrain>(TII, TRI);
 }
 
-} // end namespace llvm
-
 /// constrainLocalCopy handles two possibilities:
 /// 1) Local src:
 ///    I0: = dst
@@ -3445,14 +3433,13 @@ void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
 }
 #endif
 
-namespace llvm {
 /// Return true if this heuristic determines order.
 /// TODO: Consider refactor return type of these functions as integer or enum,
 /// as we may need to differentiate whether TryCand is better than Cand.
-bool tryLess(int TryVal, int CandVal,
-             GenericSchedulerBase::SchedCandidate &TryCand,
-             GenericSchedulerBase::SchedCandidate &Cand,
-             GenericSchedulerBase::CandReason Reason) {
+bool llvm::tryLess(int TryVal, int CandVal,
+                   GenericSchedulerBase::SchedCandidate &TryCand,
+                   GenericSchedulerBase::SchedCandidate &Cand,
+                   GenericSchedulerBase::CandReason Reason) {
   if (TryVal < CandVal) {
     TryCand.Reason = Reason;
     return true;
@@ -3465,10 +3452,10 @@ bool tryLess(int TryVal, int CandVal,
   return false;
 }
 
-bool tryGreater(int TryVal, int CandVal,
-                GenericSchedulerBase::SchedCandidate &TryCand,
-                GenericSchedulerBase::SchedCandidate &Cand,
-                GenericSchedulerBase::CandReason Reason) {
+bool llvm::tryGreater(int TryVal, int CandVal,
+                      GenericSchedulerBase::SchedCandidate &TryCand,
+                      GenericSchedulerBase::SchedCandidate &Cand,
+                      GenericSchedulerBase::CandReason Reason) {
   if (TryVal > CandVal) {
     TryCand.Reason = Reason;
     return true;
@@ -3481,9 +3468,9 @@ bool tryGreater(int TryVal, int CandVal,
   return false;
 }
 
-bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
-                GenericSchedulerBase::SchedCandidate &Cand,
-                SchedBoundary &Zone) {
+bool llvm::tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
+                      GenericSchedulerBase::SchedCandidate &Cand,
+                      SchedBoundary &Zone) {
   if (Zone.isTop()) {
     // Prefer the candidate with the lesser depth, but only if one of them has
    // depth greater than the total latency scheduled so far, otherwise either
@@ -3513,7 +3500,6 @@ bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
   }
   return false;
 }
-} // end namespace llvm
 
 static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop,
                       bool IsPostRA = false) {
@@ -3798,14 +3784,12 @@ void GenericScheduler::registerRoots() {
   }
 }
 
-namespace llvm {
-bool tryPressure(const PressureChange &TryP,
-                 const PressureChange &CandP,
-                 GenericSchedulerBase::SchedCandidate &TryCand,
-                 GenericSchedulerBase::SchedCandidate &Cand,
-                 GenericSchedulerBase::CandReason Reason,
-                 const TargetRegisterInfo *TRI,
-                 const MachineFunction &MF) {
+bool llvm::tryPressure(const PressureChange &TryP, const PressureChange &CandP,
+                       GenericSchedulerBase::SchedCandidate &TryCand,
+                       GenericSchedulerBase::SchedCandidate &Cand,
+                       GenericSchedulerBase::CandReason Reason,
+                       const TargetRegisterInfo *TRI,
+                       const MachineFunction &MF) {
   // If one candidate decreases and the other increases, go with it.
   // Invalid candidates have UnitInc==0.
   if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
@@ -3838,7 +3822,7 @@ bool tryPressure(const PressureChange &TryP,
   return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
 }
 
-unsigned getWeakLeft(const SUnit *SU, bool isTop) {
+unsigned llvm::getWeakLeft(const SUnit *SU, bool isTop) {
   return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
 }
 
@@ -3849,7 +3833,7 @@ unsigned getWeakLeft(const SUnit *SU, bool isTop) {
 /// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
 /// with the operation that produces or consumes the physreg. We'll do this when
 /// regalloc has support for parallel copies.
-int biasPhysReg(const SUnit *SU, bool isTop) {
+int llvm::biasPhysReg(const SUnit *SU, bool isTop) {
   const MachineInstr *MI = SU->getInstr();
 
   if (MI->isCopy()) {
@@ -3884,7 +3868,6 @@ int biasPhysReg(const SUnit *SU, bool isTop) {
 
   return 0;
 }
-} // end namespace llvm
 
 void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                      bool AtTop,
@@ -4812,13 +4795,13 @@ static MachineSchedRegistry ShufflerRegistry(
 //===----------------------------------------------------------------------===//
 
 #ifndef NDEBUG
-namespace llvm {
 
-template<> struct GraphTraits<
-  ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {};
+template <>
+struct llvm::GraphTraits<ScheduleDAGMI *> : public GraphTraits<ScheduleDAG *> {
+};
 
-template<>
-struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
+template <>
+struct llvm::DOTGraphTraits<ScheduleDAGMI *> : public DefaultDOTGraphTraits {
   DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
 
   static std::string getGraphName(const ScheduleDAG *G) {
@@ -4878,7 +4861,6 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
   }
 };
 
-} // end namespace llvm
 #endif // NDEBUG
 
 /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
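The tryLess/tryGreater helpers given qualified names above implement a tie-breaking protocol: return true when a criterion decides the order (recording the reason on the winner), false to fall through to the next heuristic. A simplified standalone sketch of tryLess (types reduced to the bare pattern):

#include <cassert>

enum Reason { NoCand, PhysReg, RegExcess, Latency }; // priority order

struct Candidate {
  Reason R = NoCand;
};

static bool tryLess(int TryVal, int CandVal, Candidate &TryCand,
                    Candidate &Cand, Reason Why) {
  if (TryVal < CandVal) {
    TryCand.R = Why; // TryCand wins on this criterion.
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.R > Why)
      Cand.R = Why; // Cand wins; keep the strongest deciding reason.
    return true;
  }
  return false; // Tie: defer to the next heuristic.
}

int main() {
  Candidate A, B;
  assert(tryLess(1, 2, A, B, Latency) && A.R == Latency);
  assert(!tryLess(3, 3, A, B, Latency)); // tie falls through
}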
void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "Computing " << getName() << " trace through " diff --git a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp index 087ac62..59c587c 100644 --- a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp +++ b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp @@ -9,7 +9,7 @@ #include "llvm/CodeGen/NonRelocatableStringpool.h" #include "llvm/ADT/STLExtras.h" -namespace llvm { +using namespace llvm; DwarfStringPoolEntryRef NonRelocatableStringpool::getEntry(StringRef S) { auto I = Strings.try_emplace(S); @@ -43,5 +43,3 @@ NonRelocatableStringpool::getEntriesForEmission() const { }); return Result; } - -} // namespace llvm diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp index 6f373a5..e9ffa85 100644 --- a/llvm/lib/CodeGen/SafeStack.cpp +++ b/llvm/lib/CodeGen/SafeStack.cpp @@ -76,8 +76,6 @@ using namespace llvm::safestack; #define DEBUG_TYPE "safe-stack" -namespace llvm { - STATISTIC(NumFunctions, "Total number of functions"); STATISTIC(NumUnsafeStackFunctions, "Number of functions with unsafe stack"); STATISTIC(NumUnsafeStackRestorePointsFunctions, @@ -89,8 +87,6 @@ STATISTIC(NumUnsafeDynamicAllocas, "Number of unsafe dynamic allocas"); STATISTIC(NumUnsafeByValArguments, "Number of unsafe byval arguments"); STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads"); -} // namespace llvm - /// Use __safestack_pointer_address even if the platform has a faster way of /// access safe stack pointer. static cl::opt<bool> diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index eae2e8c..9662511 100644 --- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -1551,14 +1551,10 @@ LLVM_DUMP_METHOD void ILPValue::dump() const { dbgs() << *this << '\n'; } -namespace llvm { - -LLVM_ATTRIBUTE_UNUSED -raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) { +[[maybe_unused]] +raw_ostream &llvm::operator<<(raw_ostream &OS, const ILPValue &Val) { Val.print(OS); return OS; } -} // end namespace llvm - #endif diff --git a/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp b/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp index e7b1494..c80eade 100644 --- a/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp +++ b/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp @@ -16,57 +16,51 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; -namespace llvm { - template<> - struct DOTGraphTraits<ScheduleDAG*> : public DefaultDOTGraphTraits { +template <> +struct llvm::DOTGraphTraits<ScheduleDAG *> : public DefaultDOTGraphTraits { - DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {} + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} - static std::string getGraphName(const ScheduleDAG *G) { - return std::string(G->MF.getName()); - } + static std::string getGraphName(const ScheduleDAG *G) { + return std::string(G->MF.getName()); + } - static bool renderGraphFromBottomUp() { - return true; - } + static bool renderGraphFromBottomUp() { return true; } - static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) { - return (Node->NumPreds > 10 || Node->NumSuccs > 10); - } + static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) { + return (Node->NumPreds > 10 || Node->NumSuccs > 10); + } - static std::string getNodeIdentifierLabel(const SUnit *Node, - const ScheduleDAG *Graph) { - std::string R; - raw_string_ostream OS(R); - OS << 
static_cast<const void *>(Node); - return R; - } + static std::string getNodeIdentifierLabel(const SUnit *Node, + const ScheduleDAG *Graph) { + std::string R; + raw_string_ostream OS(R); + OS << static_cast<const void *>(Node); + return R; + } - /// If you want to override the dot attributes printed for a particular - /// edge, override this method. - static std::string getEdgeAttributes(const SUnit *Node, - SUnitIterator EI, - const ScheduleDAG *Graph) { - if (EI.isArtificialDep()) - return "color=cyan,style=dashed"; - if (EI.isCtrlDep()) - return "color=blue,style=dashed"; - return ""; - } + /// If you want to override the dot attributes printed for a particular + /// edge, override this method. + static std::string getEdgeAttributes(const SUnit *Node, SUnitIterator EI, + const ScheduleDAG *Graph) { + if (EI.isArtificialDep()) + return "color=cyan,style=dashed"; + if (EI.isCtrlDep()) + return "color=blue,style=dashed"; + return ""; + } + std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *Graph); + static std::string getNodeAttributes(const SUnit *N, + const ScheduleDAG *Graph) { + return "shape=Mrecord"; + } - std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *Graph); - static std::string getNodeAttributes(const SUnit *N, - const ScheduleDAG *Graph) { - return "shape=Mrecord"; - } - - static void addCustomGraphFeatures(ScheduleDAG *G, - GraphWriter<ScheduleDAG*> &GW) { - return G->addCustomGraphFeatures(GW); - } - }; -} + static void addCustomGraphFeatures(ScheduleDAG *G, + GraphWriter<ScheduleDAG *> &GW) { + return G->addCustomGraphFeatures(GW); + } +}; std::string DOTGraphTraits<ScheduleDAG*>::getNodeLabel(const SUnit *SU, const ScheduleDAG *G) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b1accdd..787a81a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -509,6 +509,7 @@ namespace { SDValue visitFMUL(SDNode *N); template <class MatchContextClass> SDValue visitFMA(SDNode *N); SDValue visitFMAD(SDNode *N); + SDValue visitFMULADD(SDNode *N); SDValue visitFDIV(SDNode *N); SDValue visitFREM(SDNode *N); SDValue visitFSQRT(SDNode *N); @@ -1991,6 +1992,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::FMUL: return visitFMUL(N); case ISD::FMA: return visitFMA<EmptyMatchContext>(N); case ISD::FMAD: return visitFMAD(N); + case ISD::FMULADD: return visitFMULADD(N); case ISD::FDIV: return visitFDIV(N); case ISD::FREM: return visitFREM(N); case ISD::FSQRT: return visitFSQRT(N); @@ -5042,7 +5044,6 @@ static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) { unsigned Opc = N->getOpcode(); bool IsDiv = (ISD::SDIV == Opc) || (ISD::UDIV == Opc); - ConstantSDNode *N1C = isConstOrConstSplat(N1); // X / undef -> undef // X % undef -> undef @@ -5074,7 +5075,7 @@ static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) { // division-by-zero or remainder-by-zero, so assume the divisor is 1. // TODO: Similarly, if we're zero-extending a boolean divisor, then assume // it's a 1. - if ((N1C && N1C->isOne()) || (VT.getScalarType() == MVT::i1)) + if (isOneOrOneSplat(N1) || (VT.getScalarType() == MVT::i1)) return IsDiv ? 
N0 : DAG.getConstant(0, DL, VT); return SDValue(); @@ -18444,6 +18445,21 @@ SDValue DAGCombiner::visitFMAD(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitFMULADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + // Constant fold FMULADD. + if (SDValue C = + DAG.FoldConstantArithmetic(ISD::FMULADD, DL, VT, {N0, N1, N2})) + return C; + + return SDValue(); +} + // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal. // E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 08af74c..90edaf3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5063,8 +5063,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, break; case ISD::ADD: case ISD::ADDC: - // Add can have at most one carry bit. Thus we know that the output - // is, at worst, one more bit than the inputs. + // TODO: Move Operand 1 check before Operand 0 check Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1); if (Tmp == 1) return 1; // Early out. @@ -5088,6 +5087,9 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1); if (Tmp2 == 1) return 1; // Early out. + + // Add can have at most one carry bit. Thus we know that the output + // is, at worst, one more bit than the inputs. return std::min(Tmp, Tmp2) - 1; case ISD::SUB: Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1); @@ -5786,6 +5788,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::FCOPYSIGN: case ISD::FMA: case ISD::FMAD: + case ISD::FMULADD: case ISD::FP_EXTEND: case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: @@ -5904,6 +5907,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, case ISD::FCOSH: case ISD::FTANH: case ISD::FMA: + case ISD::FMULADD: case ISD::FMAD: { if (SNaN) return true; @@ -6401,8 +6405,9 @@ static SDValue foldCONCAT_VECTORS(const SDLoc &DL, EVT VT, if (VT.isScalableVector()) return SDValue(); - // A CONCAT_VECTOR with all UNDEF/BUILD_VECTOR operands can be - // simplified to one big BUILD_VECTOR. + // A CONCAT_VECTOR of scalar sources, such as UNDEF, BUILD_VECTOR and + // single-element INSERT_VECTOR_ELT operands can be simplified to one big + // BUILD_VECTOR. // FIXME: Add support for SCALAR_TO_VECTOR as well. EVT SVT = VT.getScalarType(); SmallVector<SDValue, 16> Elts; @@ -6412,6 +6417,10 @@ static SDValue foldCONCAT_VECTORS(const SDLoc &DL, EVT VT, Elts.append(OpVT.getVectorNumElements(), DAG.getUNDEF(SVT)); else if (Op.getOpcode() == ISD::BUILD_VECTOR) Elts.append(Op->op_begin(), Op->op_end()); + else if (Op.getOpcode() == ISD::INSERT_VECTOR_ELT && + OpVT.getVectorNumElements() == 1 && + isNullConstant(Op.getOperand(2))) + Elts.push_back(Op.getOperand(1)); else return SDValue(); } @@ -7231,7 +7240,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, } // Handle fma/fmad special cases. 
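
The new visitFMULADD hook above only constant-folds the node, and the FoldConstantArithmetic hunk below folds FMULADD the same way as FMAD: the multiply and the add are rounded separately, unlike FMA's single fused rounding. A standalone C++ illustration of that difference; the operand values are chosen specifically so the two results diverge:

    #include <cmath>
    #include <cstdio>

    int main() {
      // (2^27 + 1)^2 = 2^54 + 2^28 + 1; the trailing 1 does not survive
      // rounding to double, because ulp(2^54) == 4.
      double A = 0x1p27 + 1.0;
      double B = 0x1p27 + 1.0;
      double C = -(0x1p54 + 0x1p28);

      double TwoStep = A * B + C;       // FMAD/FMULADD folding: two roundings
      double Fused = std::fma(A, B, C); // FMA folding: one fused rounding

      std::printf("two-step = %g, fused = %g\n", TwoStep, Fused); // 0 vs 1
      return 0;
    }
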
- if (Opcode == ISD::FMA || Opcode == ISD::FMAD) { + if (Opcode == ISD::FMA || Opcode == ISD::FMAD || Opcode == ISD::FMULADD) { assert(VT.isFloatingPoint() && "This operator only applies to FP types!"); assert(Ops[0].getValueType() == VT && Ops[1].getValueType() == VT && Ops[2].getValueType() == VT && "FMA types must match!"); @@ -7242,7 +7251,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, APFloat V1 = C1->getValueAPF(); const APFloat &V2 = C2->getValueAPF(); const APFloat &V3 = C3->getValueAPF(); - if (Opcode == ISD::FMAD) { + if (Opcode == ISD::FMAD || Opcode == ISD::FMULADD) { V1.multiply(V2, APFloat::rmNearestTiesToEven); V1.add(V3, APFloat::rmNearestTiesToEven); } else @@ -8781,7 +8790,7 @@ static SDValue getMemcpyLoadsAndStores( if (Value.getNode()) { Store = DAG.getStore( Chain, dl, Value, - DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl), + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)), DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo); OutChains.push_back(Store); } @@ -8797,7 +8806,7 @@ static SDValue getMemcpyLoadsAndStores( assert(NVT.bitsGE(VT)); bool isDereferenceable = - SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL); + SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL); MachineMemOperand::Flags SrcMMOFlags = MMOFlags; if (isDereferenceable) SrcMMOFlags |= MachineMemOperand::MODereferenceable; @@ -8806,14 +8815,14 @@ static SDValue getMemcpyLoadsAndStores( Value = DAG.getExtLoad( ISD::EXTLOAD, dl, NVT, Chain, - DAG.getMemBasePlusOffset(Src, TypeSize::getFixed(SrcOff), dl), + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)), SrcPtrInfo.getWithOffset(SrcOff), VT, commonAlignment(*SrcAlign, SrcOff), SrcMMOFlags, NewAAInfo); OutLoadChains.push_back(Value.getValue(1)); Store = DAG.getTruncStore( Chain, dl, Value, - DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl), + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)), DstPtrInfo.getWithOffset(DstOff), VT, Alignment, MMOFlags, NewAAInfo); OutStoreChains.push_back(Store); } @@ -8943,14 +8952,14 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, SDValue Value; bool isDereferenceable = - SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL); + SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL); MachineMemOperand::Flags SrcMMOFlags = MMOFlags; if (isDereferenceable) SrcMMOFlags |= MachineMemOperand::MODereferenceable; Value = DAG.getLoad( VT, dl, Chain, - DAG.getMemBasePlusOffset(Src, TypeSize::getFixed(SrcOff), dl), + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)), SrcPtrInfo.getWithOffset(SrcOff), *SrcAlign, SrcMMOFlags, NewAAInfo); LoadValues.push_back(Value); LoadChains.push_back(Value.getValue(1)); @@ -8965,7 +8974,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, Store = DAG.getStore( Chain, dl, LoadValues[i], - DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl), + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)), DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo); OutChains.push_back(Store); DstOff += VTSize; @@ -9097,7 +9106,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl, assert(Value.getValueType() == VT && "Value with wrong type."); SDValue Store = DAG.getStore( Chain, dl, Value, - DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl), + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)), 
DstPtrInfo.getWithOffset(DstOff), Alignment, isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone, NewAAInfo); @@ -11844,25 +11853,38 @@ SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, /// getNodeIfExists - Get the specified node if it's already available, or /// else return NULL. SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList, - ArrayRef<SDValue> Ops) { + ArrayRef<SDValue> Ops, + bool AllowCommute) { SDNodeFlags Flags; if (Inserter) Flags = Inserter->getFlags(); - return getNodeIfExists(Opcode, VTList, Ops, Flags); + return getNodeIfExists(Opcode, VTList, Ops, Flags, AllowCommute); } SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef<SDValue> Ops, - const SDNodeFlags Flags) { - if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) { + const SDNodeFlags Flags, + bool AllowCommute) { + if (VTList.VTs[VTList.NumVTs - 1] == MVT::Glue) + return nullptr; + + auto Lookup = [&](ArrayRef<SDValue> LookupOps) -> SDNode * { FoldingSetNodeID ID; - AddNodeIDNode(ID, Opcode, VTList, Ops); + AddNodeIDNode(ID, Opcode, VTList, LookupOps); void *IP = nullptr; - if (SDNode *E = FindNodeOrInsertPos(ID, SDLoc(), IP)) { + if (SDNode *E = FindNodeOrInsertPos(ID, IP)) { E->intersectFlagsWith(Flags); return E; } - } + return nullptr; + }; + + if (SDNode *Existing = Lookup(Ops)) + return Existing; + + if (AllowCommute && TLI->isCommutativeBinOp(Opcode)) + return Lookup({Ops[1], Ops[0]}); + return nullptr; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index c21890a..0f2b518 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6996,6 +6996,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), getValue(I.getArgOperand(2)), Flags)); + } else if (TLI.isOperationLegalOrCustom(ISD::FMULADD, VT)) { + // TODO: Support splitting the vector. + setValue(&I, DAG.getNode(ISD::FMULADD, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), + getValue(I.getArgOperand(2)), Flags)); } else { // TODO: Intrinsic calls should have fast-math-flags. 
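
The SelectionDAGBuilder hunk above gives llvm.fmuladd a middle lowering tier: fuse to FMA when profitable, otherwise keep a single ISD::FMULADD node if the target accepts one, and only then fall back to the separate multiply and add built below. A condensed sketch of that dispatch; the two boolean parameters are stand-ins for the TLI queries (isFMAFasterThanFMulAndFAdd and isOperationLegalOrCustom), not the real API:

    #include <cstdio>

    enum class FMulAddLowering { FusedFMA, KeepFMulAdd, MulThenAdd };

    FMulAddLowering chooseLowering(bool FMAProfitable, bool FMulAddLegal) {
      if (FMAProfitable)
        return FMulAddLowering::FusedFMA;    // single ISD::FMA node
      if (FMulAddLegal)
        return FMulAddLowering::KeepFMulAdd; // new path: one ISD::FMULADD node
      return FMulAddLowering::MulThenAdd;    // fallback: ISD::FMUL + ISD::FADD
    }

    int main() {
      std::printf("%d\n", static_cast<int>(chooseLowering(false, true))); // 1
      return 0;
    }
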
SDValue Mul = DAG.getNode( diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index fcfbfe6..39cbfad 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -310,6 +310,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FMA: return "fma"; case ISD::STRICT_FMA: return "strict_fma"; case ISD::FMAD: return "fmad"; + case ISD::FMULADD: return "fmuladd"; case ISD::FREM: return "frem"; case ISD::STRICT_FREM: return "strict_frem"; case ISD::FCOPYSIGN: return "fcopysign"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index cc503d3..920dff9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -7676,6 +7676,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, break; } case ISD::FMA: + case ISD::FMULADD: case ISD::FMAD: { if (!Flags.hasNoSignedZeros()) break; diff --git a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp index 64e5cd5..95a9c3f 100644 --- a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp +++ b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp @@ -306,10 +306,7 @@ char &llvm::StackFrameLayoutAnalysisPassID = StackFrameLayoutAnalysisLegacy::ID; INITIALIZE_PASS(StackFrameLayoutAnalysisLegacy, "stack-frame-layout", "Stack Frame Layout", false, false) -namespace llvm { /// Returns a newly-created StackFrameLayout pass. -MachineFunctionPass *createStackFrameLayoutAnalysisPass() { +MachineFunctionPass *llvm::createStackFrameLayoutAnalysisPass() { return new StackFrameLayoutAnalysisLegacy(); } - -} // namespace llvm diff --git a/llvm/lib/CodeGen/StaticDataAnnotator.cpp b/llvm/lib/CodeGen/StaticDataAnnotator.cpp index 53a9ab4..eac20120 100644 --- a/llvm/lib/CodeGen/StaticDataAnnotator.cpp +++ b/llvm/lib/CodeGen/StaticDataAnnotator.cpp @@ -75,22 +75,11 @@ bool StaticDataAnnotator::runOnModule(Module &M) { bool Changed = false; for (auto &GV : M.globals()) { - if (GV.isDeclarationForLinker()) + if (!llvm::memprof::IsAnnotationOK(GV)) continue; - // The implementation below assumes prior passes don't set section prefixes, - // and specifically do 'assign' rather than 'update'. So report error if a - // section prefix is already set. - if (auto maybeSectionPrefix = GV.getSectionPrefix(); - maybeSectionPrefix && !maybeSectionPrefix->empty()) - llvm::report_fatal_error("Global variable " + GV.getName() + - " already has a section prefix " + - *maybeSectionPrefix); - StringRef SectionPrefix = SDPI->getConstantSectionPrefix(&GV, PSI); - if (SectionPrefix.empty()) - continue; - + // setSectionPrefix returns true if the section prefix is updated. Changed |= GV.setSectionPrefix(SectionPrefix); } diff --git a/llvm/lib/CodeGen/StaticDataSplitter.cpp b/llvm/lib/CodeGen/StaticDataSplitter.cpp index e22dc25..1593a40 100644 --- a/llvm/lib/CodeGen/StaticDataSplitter.cpp +++ b/llvm/lib/CodeGen/StaticDataSplitter.cpp @@ -130,10 +130,8 @@ StaticDataSplitter::getConstant(const MachineOperand &Op, if (Op.isGlobal()) { // Find global variables with local linkage. const GlobalVariable *GV = getLocalLinkageGlobalVariable(Op.getGlobal()); - // Skip 'llvm.'-prefixed global variables conservatively because they are - // often handled specially, and skip those not in static data - // sections. 
- if (!GV || GV->getName().starts_with("llvm.") || + // Skip those not eligible for annotation or not in static data sections. + if (!GV || !llvm::memprof::IsAnnotationOK(*GV) || !inStaticDataSection(*GV, TM)) return nullptr; return GV; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index c23281a..060b1dd 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -815,7 +815,8 @@ void TargetLoweringBase::initActions() { ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN, ISD::FCOSH, ISD::FSINH, - ISD::FTANH, ISD::FATAN2}, + ISD::FTANH, ISD::FATAN2, + ISD::FMULADD}, VT, Expand); // Overflow operations default to expand diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index c9e4618..971f822 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -102,10 +102,8 @@ bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet, return true; } -namespace llvm { - -Printable printReg(Register Reg, const TargetRegisterInfo *TRI, - unsigned SubIdx, const MachineRegisterInfo *MRI) { +Printable llvm::printReg(Register Reg, const TargetRegisterInfo *TRI, + unsigned SubIdx, const MachineRegisterInfo *MRI) { return Printable([Reg, TRI, SubIdx, MRI](raw_ostream &OS) { if (!Reg) OS << "$noreg"; @@ -135,7 +133,7 @@ Printable printReg(Register Reg, const TargetRegisterInfo *TRI, }); } -Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) { +Printable llvm::printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) { return Printable([Unit, TRI](raw_ostream &OS) { // Generic printout when TRI is missing. if (!TRI) { @@ -158,7 +156,7 @@ Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) { }); } -Printable printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) { +Printable llvm::printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) { return Printable([Unit, TRI](raw_ostream &OS) { if (Register::isVirtualRegister(Unit)) { OS << '%' << Register(Unit).virtRegIndex(); @@ -168,8 +166,9 @@ Printable printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) { }); } -Printable printRegClassOrBank(Register Reg, const MachineRegisterInfo &RegInfo, - const TargetRegisterInfo *TRI) { +Printable llvm::printRegClassOrBank(Register Reg, + const MachineRegisterInfo &RegInfo, + const TargetRegisterInfo *TRI) { return Printable([Reg, &RegInfo, TRI](raw_ostream &OS) { if (RegInfo.getRegClassOrNull(Reg)) OS << StringRef(TRI->getRegClassName(RegInfo.getRegClass(Reg))).lower(); @@ -183,8 +182,6 @@ Printable printRegClassOrBank(Register Reg, const MachineRegisterInfo &RegInfo, }); } -} // end namespace llvm - /// getAllocatableClass - Return the maximal subclass of the given register /// class that is allocatable, or NULL. const TargetRegisterClass * diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp index 7a0256f..38e9ac5 100644 --- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp +++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp @@ -629,6 +629,10 @@ Error DwarfTransformer::convert(uint32_t NumThreads, OutputAggregator &Out) { size_t NumBefore = Gsym.getNumFunctionInfos(); auto getDie = [&](DWARFUnit &DwarfUnit) -> DWARFDie { DWARFDie ReturnDie = DwarfUnit.getUnitDIE(false); + // Apple uses DW_AT_GNU_dwo_id for things other than split DWARF.
+ if (IsMachO) + return ReturnDie; + if (DwarfUnit.getDWOId()) { DWARFUnit *DWOCU = DwarfUnit.getNonSkeletonUnitDIE(false).getDwarfUnit(); if (!DWOCU->isDWOUnit()) diff --git a/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp index 33734b8..bb8d2cb 100644 --- a/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp @@ -90,7 +90,7 @@ void MapperJITLinkMemoryManager::allocate(const JITLinkDylib *JD, LinkGraph &G, auto TotalSize = Seg.ContentSize + Seg.ZeroFillSize; Seg.Addr = NextSegAddr; - Seg.WorkingMem = Mapper->prepare(NextSegAddr, TotalSize); + Seg.WorkingMem = Mapper->prepare(G, NextSegAddr, TotalSize); NextSegAddr += alignTo(TotalSize, Mapper->getPageSize()); diff --git a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp index ea3b22a..7b327af 100644 --- a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp @@ -58,7 +58,8 @@ void InProcessMemoryMapper::reserve(size_t NumBytes, ExecutorAddrRange(ExecutorAddr::fromPtr(MB.base()), MB.allocatedSize())); } -char *InProcessMemoryMapper::prepare(ExecutorAddr Addr, size_t ContentSize) { +char *InProcessMemoryMapper::prepare(jitlink::LinkGraph &G, ExecutorAddr Addr, + size_t ContentSize) { return Addr.toPtr<char *>(); } @@ -324,7 +325,8 @@ void SharedMemoryMapper::reserve(size_t NumBytes, #endif } -char *SharedMemoryMapper::prepare(ExecutorAddr Addr, size_t ContentSize) { +char *SharedMemoryMapper::prepare(jitlink::LinkGraph &G, ExecutorAddr Addr, + size_t ContentSize) { auto R = Reservations.upper_bound(Addr); assert(R != Reservations.begin() && "Attempt to prepare unreserved range"); R--; diff --git a/llvm/lib/IR/ConstantFPRange.cpp b/llvm/lib/IR/ConstantFPRange.cpp index 51d2e21..5b87686 100644 --- a/llvm/lib/IR/ConstantFPRange.cpp +++ b/llvm/lib/IR/ConstantFPRange.cpp @@ -8,6 +8,7 @@ #include "llvm/IR/ConstantFPRange.h" #include "llvm/ADT/APFloat.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include <cassert> @@ -506,3 +507,168 @@ ConstantFPRange ConstantFPRange::sub(const ConstantFPRange &Other) const { // fsub X, Y = fadd X, (fneg Y) return add(Other.negate()); } + +void ConstantFPRange::flushDenormals(DenormalMode::DenormalModeKind Mode) { + if (Mode == DenormalMode::IEEE) + return; + FPClassTest Class = classify(); + if (!(Class & fcSubnormal)) + return; + + auto &Sem = getSemantics(); + // PreserveSign: PosSubnormal -> PosZero, NegSubnormal -> NegZero + // PositiveZero: PosSubnormal -> PosZero, NegSubnormal -> PosZero + // Dynamic: PosSubnormal -> PosZero, NegSubnormal -> NegZero/PosZero + bool ZeroLowerNegative = + Mode != DenormalMode::PositiveZero && (Class & fcNegSubnormal); + bool ZeroUpperNegative = + Mode == DenormalMode::PreserveSign && !(Class & fcPosSubnormal); + assert((ZeroLowerNegative || !ZeroUpperNegative) && + "ZeroLower is greater than ZeroUpper."); + Lower = minnum(Lower, APFloat::getZero(Sem, ZeroLowerNegative)); + Upper = maxnum(Upper, APFloat::getZero(Sem, ZeroUpperNegative)); +} + +/// Represent a contiguous range of values sharing the same sign. +struct SameSignRange { + bool HasZero; + bool HasNonZero; + bool HasInf; + // The lower and upper bounds of the range (inclusive). + // The sign is dropped and infinities are excluded. 
+ std::optional<std::pair<APFloat, APFloat>> FinitePart; + + explicit SameSignRange(const APFloat &Lower, const APFloat &Upper) + : HasZero(Lower.isZero()), HasNonZero(!Upper.isZero()), + HasInf(Upper.isInfinity()) { + assert(!Lower.isNegative() && !Upper.isNegative() && + "The sign should be dropped."); + assert(strictCompare(Lower, Upper) != APFloat::cmpGreaterThan && + "Empty set."); + if (!Lower.isInfinity()) + FinitePart = {Lower, + HasInf ? APFloat::getLargest(Lower.getSemantics()) : Upper}; + } +}; + +/// Split the range into positive and negative components. +static void splitPosNeg(const APFloat &Lower, const APFloat &Upper, + std::optional<SameSignRange> &NegPart, + std::optional<SameSignRange> &PosPart) { + assert(strictCompare(Lower, Upper) != APFloat::cmpGreaterThan && + "Non-NaN part is empty."); + if (Lower.isNegative() == Upper.isNegative()) { + if (Lower.isNegative()) + NegPart = SameSignRange{abs(Upper), abs(Lower)}; + else + PosPart = SameSignRange{Lower, Upper}; + return; + } + auto &Sem = Lower.getSemantics(); + NegPart = SameSignRange{APFloat::getZero(Sem), abs(Lower)}; + PosPart = SameSignRange{APFloat::getZero(Sem), Upper}; +} + +ConstantFPRange ConstantFPRange::mul(const ConstantFPRange &Other) const { + auto &Sem = getSemantics(); + bool ResMayBeQNaN = ((MayBeQNaN || MayBeSNaN) && !Other.isEmptySet()) || + ((Other.MayBeQNaN || Other.MayBeSNaN) && !isEmptySet()); + if (isNaNOnly() || Other.isNaNOnly()) + return getNaNOnly(Sem, /*MayBeQNaN=*/ResMayBeQNaN, + /*MayBeSNaN=*/false); + std::optional<SameSignRange> LHSNeg, LHSPos, RHSNeg, RHSPos; + splitPosNeg(Lower, Upper, LHSNeg, LHSPos); + splitPosNeg(Other.Lower, Other.Upper, RHSNeg, RHSPos); + APFloat ResLower = APFloat::getInf(Sem, /*Negative=*/false); + APFloat ResUpper = APFloat::getInf(Sem, /*Negative=*/true); + auto Update = [&](std::optional<SameSignRange> &LHS, + std::optional<SameSignRange> &RHS, bool Negative) { + if (!LHS || !RHS) + return; + // 0 * inf = QNaN + ResMayBeQNaN |= LHS->HasZero && RHS->HasInf; + ResMayBeQNaN |= RHS->HasZero && LHS->HasInf; + // NonZero * inf = inf + if ((LHS->HasInf && RHS->HasNonZero) || (RHS->HasInf && LHS->HasNonZero)) + (Negative ? 
ResLower : ResUpper) = APFloat::getInf(Sem, Negative); + // Finite * Finite + if (LHS->FinitePart && RHS->FinitePart) { + APFloat NewLower = LHS->FinitePart->first * RHS->FinitePart->first; + APFloat NewUpper = LHS->FinitePart->second * RHS->FinitePart->second; + if (Negative) { + ResLower = minnum(ResLower, -NewUpper); + ResUpper = maxnum(ResUpper, -NewLower); + } else { + ResLower = minnum(ResLower, NewLower); + ResUpper = maxnum(ResUpper, NewUpper); + } + } + }; + Update(LHSNeg, RHSNeg, /*Negative=*/false); + Update(LHSNeg, RHSPos, /*Negative=*/true); + Update(LHSPos, RHSNeg, /*Negative=*/true); + Update(LHSPos, RHSPos, /*Negative=*/false); + return ConstantFPRange(ResLower, ResUpper, ResMayBeQNaN, /*MayBeSNaN=*/false); +} + +ConstantFPRange ConstantFPRange::div(const ConstantFPRange &Other) const { + auto &Sem = getSemantics(); + bool ResMayBeQNaN = ((MayBeQNaN || MayBeSNaN) && !Other.isEmptySet()) || + ((Other.MayBeQNaN || Other.MayBeSNaN) && !isEmptySet()); + if (isNaNOnly() || Other.isNaNOnly()) + return getNaNOnly(Sem, /*MayBeQNaN=*/ResMayBeQNaN, + /*MayBeSNaN=*/false); + std::optional<SameSignRange> LHSNeg, LHSPos, RHSNeg, RHSPos; + splitPosNeg(Lower, Upper, LHSNeg, LHSPos); + splitPosNeg(Other.Lower, Other.Upper, RHSNeg, RHSPos); + APFloat ResLower = APFloat::getInf(Sem, /*Negative=*/false); + APFloat ResUpper = APFloat::getInf(Sem, /*Negative=*/true); + auto Update = [&](std::optional<SameSignRange> &LHS, + std::optional<SameSignRange> &RHS, bool Negative) { + if (!LHS || !RHS) + return; + // inf / inf = QNaN 0 / 0 = QNaN + ResMayBeQNaN |= LHS->HasInf && RHS->HasInf; + ResMayBeQNaN |= LHS->HasZero && RHS->HasZero; + // It is not straightforward to infer HasNonZeroFinite = HasFinite && + // HasNonZero. By definitions we have: + // HasFinite = HasNonZeroFinite || HasZero + // HasNonZero = HasNonZeroFinite || HasInf + // Since the range is contiguous, if both HasFinite and HasNonZero are true, + // HasNonZeroFinite must be true. + bool LHSHasNonZeroFinite = LHS->FinitePart && LHS->HasNonZero; + bool RHSHasNonZeroFinite = RHS->FinitePart && RHS->HasNonZero; + // inf / Finite = inf FiniteNonZero / 0 = inf + if ((LHS->HasInf && RHS->FinitePart) || + (LHSHasNonZeroFinite && RHS->HasZero)) + (Negative ? ResLower : ResUpper) = APFloat::getInf(Sem, Negative); + // Finite / inf = 0 + if (LHS->FinitePart && RHS->HasInf) { + APFloat Zero = APFloat::getZero(Sem, /*Negative=*/Negative); + ResLower = minnum(ResLower, Zero); + ResUpper = maxnum(ResUpper, Zero); + } + // Finite / FiniteNonZero + if (LHS->FinitePart && RHSHasNonZeroFinite) { + assert(!RHS->FinitePart->second.isZero() && + "Divisor should be non-zero."); + APFloat NewLower = LHS->FinitePart->first / RHS->FinitePart->second; + APFloat NewUpper = LHS->FinitePart->second / + (RHS->FinitePart->first.isZero() + ? 
APFloat::getSmallest(Sem, /*Negative=*/false) + : RHS->FinitePart->first); + if (Negative) { + ResLower = minnum(ResLower, -NewUpper); + ResUpper = maxnum(ResUpper, -NewLower); + } else { + ResLower = minnum(ResLower, NewLower); + ResUpper = maxnum(ResUpper, NewUpper); + } + } + }; + Update(LHSNeg, RHSNeg, /*Negative=*/false); + Update(LHSNeg, RHSPos, /*Negative=*/true); + Update(LHSPos, RHSNeg, /*Negative=*/true); + Update(LHSPos, RHSPos, /*Negative=*/false); + return ConstantFPRange(ResLower, ResUpper, ResMayBeQNaN, /*MayBeSNaN=*/false); +} diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index 3842b1a..6a9ef2e 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -741,7 +741,8 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1, assert(!CI2->isZero() && "And zero handled above"); if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) { // If and'ing the address of a global with a constant, fold it. - if (CE1->getOpcode() == Instruction::PtrToInt && + if ((CE1->getOpcode() == Instruction::PtrToInt || + CE1->getOpcode() == Instruction::PtrToAddr) && isa<GlobalValue>(CE1->getOperand(0))) { GlobalValue *GV = cast<GlobalValue>(CE1->getOperand(0)); diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index 2c2950c..cbce8bd 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -667,8 +667,11 @@ Constant::PossibleRelocationsTy Constant::getRelocationInfo() const { if (CE->getOpcode() == Instruction::Sub) { ConstantExpr *LHS = dyn_cast<ConstantExpr>(CE->getOperand(0)); ConstantExpr *RHS = dyn_cast<ConstantExpr>(CE->getOperand(1)); - if (LHS && RHS && LHS->getOpcode() == Instruction::PtrToInt && - RHS->getOpcode() == Instruction::PtrToInt) { + if (LHS && RHS && + (LHS->getOpcode() == Instruction::PtrToInt || + LHS->getOpcode() == Instruction::PtrToAddr) && + (RHS->getOpcode() == Instruction::PtrToInt || + RHS->getOpcode() == Instruction::PtrToAddr)) { Constant *LHSOp0 = LHS->getOperand(0); Constant *RHSOp0 = RHS->getOperand(0); diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 3f1cc1e..27d8294 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -4098,15 +4098,8 @@ LLVMValueRef LLVMBuildGlobalStringPtr(LLVMBuilderRef B, const char *Str, return wrap(unwrap(B)->CreateGlobalString(Str, Name)); } -LLVMBool LLVMGetVolatile(LLVMValueRef MemAccessInst) { - Value *P = unwrap(MemAccessInst); - if (LoadInst *LI = dyn_cast<LoadInst>(P)) - return LI->isVolatile(); - if (StoreInst *SI = dyn_cast<StoreInst>(P)) - return SI->isVolatile(); - if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(P)) - return AI->isVolatile(); - return cast<AtomicCmpXchgInst>(P)->isVolatile(); +LLVMBool LLVMGetVolatile(LLVMValueRef Inst) { + return cast<Instruction>(unwrap(Inst))->isVolatile(); } void LLVMSetVolatile(LLVMValueRef MemAccessInst, LLVMBool isVolatile) { diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 614c3a9..15c0198 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/NoFolder.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Statepoint.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" @@ -1002,6 +1003,18 @@ CallInst *IRBuilderBase::CreateConstrainedFPCall( return C; } +Value *IRBuilderBase::CreateSelectWithUnknownProfile(Value *C, Value *True, + Value *False, + StringRef PassName, + const Twine &Name) { + Value *Ret = CreateSelectFMF(C, True, 
False, {}, Name); + if (auto *SI = dyn_cast<SelectInst>(Ret)) { + setExplicitlyUnknownBranchWeightsIfProfiled( + *SI, *SI->getParent()->getParent(), PassName); + } + return Ret; +} + Value *IRBuilderBase::CreateSelect(Value *C, Value *True, Value *False, const Twine &Name, Instruction *MDFrom) { return CreateSelectFMF(C, True, False, {}, Name, MDFrom); diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 88e7c44..9060a89 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -2965,8 +2965,7 @@ unsigned CastInst::isEliminableCastPair(Instruction::CastOps firstOp, // zext, sext -> zext, because sext can't sign extend after zext return Instruction::ZExt; case 11: { - // inttoptr, ptrtoint/ptrtoaddr -> bitcast if SrcSize<=PtrSize/AddrSize - // and SrcSize==DstSize + // inttoptr, ptrtoint/ptrtoaddr -> integer cast if (!DL) return 0; unsigned MidSize = secondOp == Instruction::PtrToAddr @@ -2974,10 +2973,15 @@ unsigned CastInst::isEliminableCastPair(Instruction::CastOps firstOp, : DL->getPointerTypeSizeInBits(MidTy); unsigned SrcSize = SrcTy->getScalarSizeInBits(); unsigned DstSize = DstTy->getScalarSizeInBits(); - // TODO: Could also produce zext or trunc here. - if (SrcSize <= MidSize && SrcSize == DstSize) - return Instruction::BitCast; - return 0; + // If the middle size is smaller than both source and destination, + // an additional masking operation would be required. + if (MidSize < SrcSize && MidSize < DstSize) + return 0; + if (DstSize < SrcSize) + return Instruction::Trunc; + if (DstSize > SrcSize) + return Instruction::ZExt; + return Instruction::BitCast; } case 12: // addrspacecast, addrspacecast -> bitcast, if SrcAS == DstAS diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index c9ff86b..c79a950 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -893,7 +893,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { if (GV.hasInitializer()) { const Constant *Init = GV.getInitializer(); const ConstantArray *InitArray = dyn_cast<ConstantArray>(Init); - Check(InitArray, "wrong initalizer for intrinsic global variable", + Check(InitArray, "wrong initializer for intrinsic global variable", Init); for (Value *Op : InitArray->operands()) { Value *V = Op->stripPointerCasts(); diff --git a/llvm/lib/ObjCopy/ConfigManager.cpp b/llvm/lib/ObjCopy/ConfigManager.cpp index eef8a21..6b7b4f1 100644 --- a/llvm/lib/ObjCopy/ConfigManager.cpp +++ b/llvm/lib/ObjCopy/ConfigManager.cpp @@ -122,14 +122,14 @@ ConfigManager::getDXContainerConfig() const { if (!Common.AddGnuDebugLink.empty() || !Common.SplitDWO.empty() || !Common.AllocSectionsPrefix.empty() || Common.DiscardMode != DiscardType::None || !Common.AddSection.empty() || - !Common.DumpSection.empty() || !Common.KeepSection.empty() || - !Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() || - !Common.SetSectionFlags.empty() || !Common.SetSectionType.empty() || - Common.ExtractDWO || Common.OnlyKeepDebug || Common.StripAllGNU || - Common.StripDWO || Common.StripDebug || Common.StripNonAlloc || - Common.StripSections || Common.StripUnneeded || - Common.DecompressDebugSections || Common.GapFill != 0 || - Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 || + !Common.KeepSection.empty() || !Common.SectionsToRename.empty() || + !Common.SetSectionAlignment.empty() || !Common.SetSectionFlags.empty() || + !Common.SetSectionType.empty() || Common.ExtractDWO || + Common.OnlyKeepDebug || Common.StripAllGNU || Common.StripDWO || + 
Common.StripDebug || Common.StripNonAlloc || Common.StripSections || + Common.StripUnneeded || Common.DecompressDebugSections || + Common.GapFill != 0 || Common.PadTo != 0 || + Common.ChangeSectionLMAValAll != 0 || !Common.ChangeSectionAddress.empty()) { return createStringError(llvm::errc::invalid_argument, "option is not supported for DXContainer"); diff --git a/llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp b/llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp index d7f3c0d..95ab3d9 100644 --- a/llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp +++ b/llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp @@ -9,8 +9,10 @@ #include "llvm/ObjCopy/DXContainer/DXContainerObjcopy.h" #include "DXContainerReader.h" #include "DXContainerWriter.h" +#include "llvm/BinaryFormat/DXContainer.h" #include "llvm/ObjCopy/CommonConfig.h" #include "llvm/ObjCopy/DXContainer/DXContainerConfig.h" +#include "llvm/Support/FileOutputBuffer.h" #include "llvm/Support/raw_ostream.h" namespace llvm { @@ -42,7 +44,47 @@ static Error extractPartAsObject(StringRef PartName, StringRef OutFilename, "part '%s' not found", PartName.str().c_str()); } +static Error dumpPartToFile(StringRef PartName, StringRef Filename, + StringRef InputFilename, Object &Obj) { + auto PartIter = llvm::find_if( + Obj.Parts, [&PartName](const Part &P) { return P.Name == PartName; }); + if (PartIter == Obj.Parts.end()) + return createFileError(Filename, + std::make_error_code(std::errc::invalid_argument), + "part '%s' not found", PartName.str().c_str()); + ArrayRef<uint8_t> Contents = PartIter->Data; + // The DXContainer format is a bit odd because the part-specific headers are + // contained inside the part data itself. For parts that contain LLVM bitcode, + // when we dump the part we want to skip the part-specific header so that we + // get a valid .bc file that we can inspect. All the data contained inside the + // program header is pulled out of the bitcode, so the header can be + // reconstructed if needed from the bitcode itself. More comprehensive + // documentation on the DXContainer format can be found at + // https://llvm.org/docs/DirectX/DXContainer.html. + + if (PartName == "DXIL" || PartName == "STAT") + Contents = Contents.drop_front(sizeof(llvm::dxbc::ProgramHeader)); + if (Contents.empty()) + return createFileError(Filename, object_error::parse_failed, + "part '%s' is empty", PartName.str().c_str()); + Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr = + FileOutputBuffer::create(Filename, Contents.size()); + if (!BufferOrErr) + return createFileError(Filename, BufferOrErr.takeError()); + std::unique_ptr<FileOutputBuffer> Buf = std::move(*BufferOrErr); + llvm::copy(Contents, Buf->getBufferStart()); + if (Error E = Buf->commit()) + return createFileError(Filename, std::move(E)); + return Error::success(); +} + static Error handleArgs(const CommonConfig &Config, Object &Obj) { + for (StringRef Flag : Config.DumpSection) { + auto [SecName, FileName] = Flag.split("="); + if (Error E = dumpPartToFile(SecName, FileName, Config.InputFilename, Obj)) + return E; + } + // Extract all sections before any modifications.
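
A standalone model of the --dump-section handling just added: each flag is split on '=' into a part name and an output file, and for DXIL/STAT parts the in-part program header is dropped so the dump starts at the bitcode. The header size below is an assumed placeholder for illustration; the real code uses sizeof(llvm::dxbc::ProgramHeader).

    #include <cstdio>
    #include <string>
    #include <utility>

    static std::pair<std::string, std::string> splitFlag(const std::string &Flag) {
      std::size_t Eq = Flag.find('=');
      return {Flag.substr(0, Eq), Flag.substr(Eq + 1)};
    }

    int main() {
      auto [Part, File] = splitFlag("DXIL=out.bc");
      std::size_t HeaderSize = 24; // assumed stand-in for sizeof(ProgramHeader)
      std::size_t Skip = (Part == "DXIL" || Part == "STAT") ? HeaderSize : 0;
      std::printf("dump part %s to %s, skipping %zu header bytes\n",
                  Part.c_str(), File.c_str(), Skip);
      return 0;
    }
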
for (StringRef Flag : Config.ExtractSection) { StringRef SectionName; diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp index 53699ce0..f256e7b 100644 --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -837,7 +837,7 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, Version = Data.getU8(Cur); if (!Cur) break; - if (Version < 2 || Version > 3) + if (Version < 2 || Version > 4) return createError("unsupported SHT_LLVM_BB_ADDR_MAP version: " + Twine(static_cast<int>(Version))); Feature = Data.getU8(Cur); // Feature byte @@ -852,6 +852,11 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, "callsite offsets feature is enabled: version = " + Twine(static_cast<int>(Version)) + " feature = " + Twine(static_cast<int>(Feature))); + if (FeatEnable.BBHash && Version < 4) + return createError("version should be >= 4 for SHT_LLVM_BB_ADDR_MAP when " + "basic block hash feature is enabled: version = " + + Twine(static_cast<int>(Version)) + + " feature = " + Twine(static_cast<int>(Feature))); uint32_t NumBlocksInBBRange = 0; uint32_t NumBBRanges = 1; typename ELFFile<ELFT>::uintX_t RangeBaseAddress = 0; @@ -907,6 +912,7 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, uint32_t Size = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr) + LastCallsiteEndOffset; uint32_t MD = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr); + uint64_t Hash = FeatEnable.BBHash ? Data.getU64(Cur) : 0; Expected<BBAddrMap::BBEntry::Metadata> MetadataOrErr = BBAddrMap::BBEntry::Metadata::decode(MD); if (!MetadataOrErr) { @@ -914,7 +920,7 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, break; } BBEntries.push_back({ID, Offset + PrevBBEndOffset, Size, - *MetadataOrErr, CallsiteEndOffsets}); + *MetadataOrErr, CallsiteEndOffsets, Hash}); PrevBBEndOffset += Offset + Size; } TotalNumBlocks += BBEntries.size(); diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp index faeeab3..8b75fbe 100644 --- a/llvm/lib/ObjectYAML/ELFEmitter.cpp +++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp @@ -1465,7 +1465,7 @@ void ELFState<ELFT>::writeSectionContent( for (const auto &[Idx, E] : llvm::enumerate(*Section.Entries)) { // Write version and feature values. if (Section.Type == llvm::ELF::SHT_LLVM_BB_ADDR_MAP) { - if (E.Version > 3) + if (E.Version > 4) WithColor::warning() << "unsupported SHT_LLVM_BB_ADDR_MAP version: " << static_cast<int>(E.Version) << "; encoding using the most recent version"; @@ -1526,6 +1526,12 @@ void ELFState<ELFT>::writeSectionContent( } SHeader.sh_size += CBA.writeULEB128(BBE.Size); SHeader.sh_size += CBA.writeULEB128(BBE.Metadata); + if (FeatureOrErr->BBHash || BBE.Hash.has_value()) { + uint64_t Hash = + BBE.Hash.has_value() ? 
BBE.Hash.value() : llvm::yaml::Hex64(0); + CBA.write<uint64_t>(Hash, ELFT::Endianness); + SHeader.sh_size += 8; + } } } if (!PGOAnalyses) diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index d9cce1e..421d6603 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -1887,6 +1887,7 @@ void MappingTraits<ELFYAML::BBAddrMapEntry::BBEntry>::mapping( IO.mapRequired("Size", E.Size); IO.mapRequired("Metadata", E.Metadata); IO.mapOptional("CallsiteEndOffsets", E.CallsiteEndOffsets); + IO.mapOptional("Hash", E.Hash); } void MappingTraits<ELFYAML::PGOAnalysisMapEntry>::mapping( diff --git a/llvm/lib/Support/DebugCounter.cpp b/llvm/lib/Support/DebugCounter.cpp index 6b65720..5ab1def 100644 --- a/llvm/lib/Support/DebugCounter.cpp +++ b/llvm/lib/Support/DebugCounter.cpp @@ -136,6 +136,13 @@ struct DebugCounterOwner : DebugCounter { cl::location(this->ShouldPrintCounter), cl::init(false), cl::desc("Print out debug counter info after all counters accumulated")}; + cl::opt<bool, true> PrintDebugCounterQueries{ + "print-debug-counter-queries", + cl::Hidden, + cl::Optional, + cl::location(this->ShouldPrintCounterQueries), + cl::init(false), + cl::desc("Print out each query of an enabled debug counter")}; cl::opt<bool, true> BreakOnLastCount{ "debug-counter-break-on-last", cl::Hidden, @@ -221,31 +228,40 @@ void DebugCounter::print(raw_ostream &OS) const { } } +bool DebugCounter::handleCounterIncrement(CounterInfo &Info) { + int64_t CurrCount = Info.Count++; + uint64_t CurrIdx = Info.CurrChunkIdx; + + if (Info.Chunks.empty()) + return true; + if (CurrIdx >= Info.Chunks.size()) + return false; + + bool Res = Info.Chunks[CurrIdx].contains(CurrCount); + if (BreakOnLast && CurrIdx == (Info.Chunks.size() - 1) && + CurrCount == Info.Chunks[CurrIdx].End) { + LLVM_BUILTIN_DEBUGTRAP; + } + if (CurrCount > Info.Chunks[CurrIdx].End) { + Info.CurrChunkIdx++; + + /// Handle consecutive blocks. + if (Info.CurrChunkIdx < Info.Chunks.size() && + CurrCount == Info.Chunks[Info.CurrChunkIdx].Begin) + return true; + } + return Res; +} + bool DebugCounter::shouldExecuteImpl(unsigned CounterName) { auto &Us = instance(); auto Result = Us.Counters.find(CounterName); if (Result != Us.Counters.end()) { auto &CounterInfo = Result->second; - int64_t CurrCount = CounterInfo.Count++; - uint64_t CurrIdx = CounterInfo.CurrChunkIdx; - - if (CounterInfo.Chunks.empty()) - return true; - if (CurrIdx >= CounterInfo.Chunks.size()) - return false; - - bool Res = CounterInfo.Chunks[CurrIdx].contains(CurrCount); - if (Us.BreakOnLast && CurrIdx == (CounterInfo.Chunks.size() - 1) && - CurrCount == CounterInfo.Chunks[CurrIdx].End) { - LLVM_BUILTIN_DEBUGTRAP; - } - if (CurrCount > CounterInfo.Chunks[CurrIdx].End) { - CounterInfo.CurrChunkIdx++; - - /// Handle consecutive blocks. - if (CounterInfo.CurrChunkIdx < CounterInfo.Chunks.size() && - CurrCount == CounterInfo.Chunks[CounterInfo.CurrChunkIdx].Begin) - return true; + bool Res = Us.handleCounterIncrement(CounterInfo); + if (Us.ShouldPrintCounterQueries && CounterInfo.IsSet) { + dbgs() << "DebugCounter " << Us.RegisteredCounters[CounterName] << "=" + << (CounterInfo.Count - 1) << (Res ? 
" execute" : " skip") << "\n"; } return Res; } diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index 80fd485..549c418 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -55,12 +55,20 @@ Error SpecialCaseList::RegexMatcher::insert(StringRef Pattern, return Error::success(); } +void SpecialCaseList::RegexMatcher::preprocess(bool BySize) { + if (BySize) { + llvm::stable_sort(RegExes, [](const Reg &A, const Reg &B) { + return A.Name.size() < B.Name.size(); + }); + } +} + void SpecialCaseList::RegexMatcher::match( StringRef Query, llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const { for (const auto &R : reverse(RegExes)) if (R.Rg.match(Query)) - Cb(R.Name, R.LineNo); + return Cb(R.Name, R.LineNo); } Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern, @@ -75,12 +83,20 @@ Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern, return Error::success(); } +void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { + if (BySize) { + llvm::stable_sort(Globs, [](const Glob &A, const Glob &B) { + return A.Name.size() < B.Name.size(); + }); + } +} + void SpecialCaseList::GlobMatcher::match( StringRef Query, llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const { for (const auto &G : reverse(Globs)) if (G.Pattern.match(Query)) - Cb(G.Name, G.LineNo); + return Cb(G.Name, G.LineNo); } SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash) @@ -91,6 +107,14 @@ SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash) M.emplace<RegexMatcher>(); } +Error SpecialCaseList::Matcher::insert(StringRef Pattern, unsigned LineNumber) { + return std::visit([&](auto &V) { return V.insert(Pattern, LineNumber); }, M); +} + +LLVM_ABI void SpecialCaseList::Matcher::preprocess(bool BySize) { + return std::visit([&](auto &V) { return V.preprocess(BySize); }, M); +} + void SpecialCaseList::Matcher::match( StringRef Query, llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const { @@ -99,10 +123,6 @@ void SpecialCaseList::Matcher::match( return std::visit([&](auto &V) { return V.match(Query, Cb); }, M); } -Error SpecialCaseList::Matcher::insert(StringRef Pattern, unsigned LineNumber) { - return std::visit([&](auto &V) { return V.insert(Pattern, LineNumber); }, M); -} - // TODO: Refactor this to return Expected<...> std::unique_ptr<SpecialCaseList> SpecialCaseList::create(const std::vector<std::string> &Paths, @@ -141,7 +161,7 @@ bool SpecialCaseList::createInternal(const std::vector<std::string> &Paths, return false; } std::string ParseError; - if (!parse(i, FileOrErr.get().get(), ParseError)) { + if (!parse(i, FileOrErr.get().get(), ParseError, /*OrderBySize=*/false)) { Error = (Twine("error parsing file '") + Path + "': " + ParseError).str(); return false; } @@ -149,9 +169,9 @@ bool SpecialCaseList::createInternal(const std::vector<std::string> &Paths, return true; } -bool SpecialCaseList::createInternal(const MemoryBuffer *MB, - std::string &Error) { - if (!parse(0, MB, Error)) +bool SpecialCaseList::createInternal(const MemoryBuffer *MB, std::string &Error, + bool OrderBySize) { + if (!parse(0, MB, Error, OrderBySize)) return false; return true; } @@ -174,7 +194,7 @@ SpecialCaseList::addSection(StringRef SectionStr, unsigned FileNo, } bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, - std::string &Error) { + std::string &Error, bool OrderBySize) { unsigned long long Version = 2; StringRef Header = MB->getBuffer(); @@ -246,6 +266,10 
@@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, return false; } } + + for (Section &S : Sections) + S.preprocess(OrderBySize); + return true; } @@ -283,6 +307,13 @@ SpecialCaseList::Section::findMatcher(StringRef Prefix, return &II->second; } +LLVM_ABI void SpecialCaseList::Section::preprocess(bool OrderBySize) { + SectionMatcher.preprocess(false); + for (auto &[K1, E] : Entries) + for (auto &[K2, M] : E) + M.preprocess(OrderBySize); +} + unsigned SpecialCaseList::Section::getLastMatch(StringRef Prefix, StringRef Query, StringRef Category) const { diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp index 804ff07..41f5187 100644 --- a/llvm/lib/Support/TextEncoding.cpp +++ b/llvm/lib/Support/TextEncoding.cpp @@ -54,9 +54,9 @@ static std::optional<TextEncoding> getKnownEncoding(StringRef Name) { return std::nullopt; } -LLVM_ATTRIBUTE_UNUSED static void -HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength, - SmallVectorImpl<char> &Result) { +[[maybe_unused]] static void HandleOverflow(size_t &Capacity, char *&Output, + size_t &OutputLength, + SmallVectorImpl<char> &Result) { // No space left in output buffer. Double the size of the underlying // memory in the SmallVectorImpl, adjust pointer and length and continue // the conversion. diff --git a/llvm/lib/Support/UnicodeNameToCodepoint.cpp b/llvm/lib/Support/UnicodeNameToCodepoint.cpp index 8d66348..6f8e091 100644 --- a/llvm/lib/Support/UnicodeNameToCodepoint.cpp +++ b/llvm/lib/Support/UnicodeNameToCodepoint.cpp @@ -476,7 +476,7 @@ nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) { std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) + 1; - LLVM_ATTRIBUTE_UNUSED static std::size_t Rows = + [[maybe_unused]] static std::size_t Rows = UnicodeNameToCodepointLargestNameSize + 1; std::vector<char> Distances( diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index 2ea3a24..afce803 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -1363,9 +1363,12 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { } case LISTSPLAT: { const auto *Value = dyn_cast<TypedInit>(LHS); - const auto *Size = dyn_cast<IntInit>(RHS); - if (Value && Size) { - SmallVector<const Init *, 8> Args(Size->getValue(), Value); + const auto *Count = dyn_cast<IntInit>(RHS); + if (Value && Count) { + if (Count->getValue() < 0) + PrintFatalError(Twine("!listsplat count ") + Count->getAsString() + + " is negative"); + SmallVector<const Init *, 8> Args(Count->getValue(), Value); return ListInit::get(Args, Value->getType()); } break; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6965116..9926a4d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -26196,9 +26196,10 @@ static SDValue performFlagSettingCombine(SDNode *N, return DCI.CombineTo(N, Res, SDValue(N, 1)); } - // Combine identical generic nodes into this node, re-using the result. + // Combine equivalent generic nodes into this node, re-using the result. 
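
The lookup below now passes AllowCommute=true, using the getNodeIfExists extension from the SelectionDAG.cpp hunk earlier: if the exact operand order is not in the CSE map and the opcode is commutative, the swapped order is tried before giving up. A small model of that two-probe lookup, with node identity reduced to an opcode and two integer operands:

    #include <cstdio>
    #include <map>
    #include <tuple>

    using NodeKey = std::tuple<int /*Opcode*/, int /*Op0*/, int /*Op1*/>;
    static std::map<NodeKey, const char *> CSEMap = {{{/*ADD*/ 1, 10, 20}, "n0"}};

    const char *getNodeIfExists(int Opc, int Op0, int Op1, bool AllowCommute,
                                bool IsCommutative) {
      if (auto It = CSEMap.find({Opc, Op0, Op1}); It != CSEMap.end())
        return It->second;
      if (AllowCommute && IsCommutative)
        if (auto It = CSEMap.find({Opc, Op1, Op0}); It != CSEMap.end())
          return It->second;
      return nullptr;
    }

    int main() {
      // add(20, 10) still finds the node registered as add(10, 20).
      const char *N = getNodeIfExists(1, 20, 10, /*AllowCommute=*/true,
                                      /*IsCommutative=*/true);
      std::printf("%s\n", N ? N : "none"); // prints n0
      return 0;
    }
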
if (SDNode *Generic = DCI.DAG.getNodeIfExists( - GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS})) + GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}, + /*AllowCommute=*/true)) DCI.CombineTo(Generic, SDValue(N, 0)); return SDValue(); diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index f110558..7e03b97 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -1360,14 +1360,24 @@ void AArch64EpilogueEmitter::emitEpilogue() { } bool CombineSPBump = shouldCombineCSRLocalStackBump(NumBytes); - // Assume we can't combine the last pop with the sp restore. - bool CombineAfterCSRBump = false; + + unsigned ProloguePopSize = PrologueSaveSize; if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { + // With CalleeSavesAboveFrameRecord ProloguePopSize is the amount of stack + // that needs to be popped until we reach the start of the SVE save area. + // The "FixedObject" stack occurs after the SVE area and must be popped + // later. + ProloguePopSize -= FixedObject; AfterCSRPopSize += FixedObject; - } else if (!CombineSPBump && PrologueSaveSize != 0) { + } + + // Assume we can't combine the last pop with the sp restore. + if (!CombineSPBump && ProloguePopSize != 0) { MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator()); while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION || - AArch64InstrInfo::isSEHInstruction(*Pop)) + AArch64InstrInfo::isSEHInstruction(*Pop) || + (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord && + isPartOfSVECalleeSaves(Pop))) Pop = std::prev(Pop); // Converting the last ldp to a post-index ldp is valid only if the last // ldp's offset is 0. @@ -1377,18 +1387,27 @@ void AArch64EpilogueEmitter::emitEpilogue() { // may clobber), convert it to a post-index ldp. if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) { convertCalleeSaveRestoreToSPPrePostIncDec( - Pop, DL, PrologueSaveSize, EmitCFI, MachineInstr::FrameDestroy, - PrologueSaveSize); + Pop, DL, ProloguePopSize, EmitCFI, MachineInstr::FrameDestroy, + ProloguePopSize); + } else if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { + MachineBasicBlock::iterator AfterLastPop = std::next(Pop); + if (AArch64InstrInfo::isSEHInstruction(*AfterLastPop)) + ++AfterLastPop; + // If not, and CalleeSavesAboveFrameRecord is enabled, deallocate + // callee-save non-SVE registers to move the stack pointer to the start of + // the SVE area. + emitFrameOffset(MBB, AfterLastPop, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(ProloguePopSize), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI); } else { - // If not, make sure to emit an add after the last ldp. + // Otherwise, make sure to emit an add after the last ldp. // We're doing this by transferring the size to be restored from the // adjustment *before* the CSR pops to the adjustment *after* the CSR // pops. - AfterCSRPopSize += PrologueSaveSize; - CombineAfterCSRBump = true; + AfterCSRPopSize += ProloguePopSize; } } - // Move past the restores of the callee-saved registers. // If we plan on combining the sp bump of the local stack size and the callee // save stack size, we might need to adjust the CSR save and restore offsets. @@ -1419,6 +1438,17 @@ void AArch64EpilogueEmitter::emitEpilogue() { --SEHEpilogueStartI; } + // Determine the ranges of SVE callee-saves. 
This is done before emitting any + // code at the end of the epilogue (for Swift async), which can get in the way + // of finding SVE callee-saves with CalleeSavesAboveFrameRecord. + auto [PPR, ZPR] = getSVEStackFrameSizes(); + auto [PPRRange, ZPRRange] = partitionSVECS( + MBB, + SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord + ? MBB.getFirstTerminator() + : FirstGPRRestoreI, + PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true); + if (HasFP && AFI->hasSwiftAsyncContext()) emitSwiftAsyncContextFramePointer(EpilogueEndI, DL); @@ -1441,14 +1471,6 @@ void AArch64EpilogueEmitter::emitEpilogue() { NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); - auto [PPR, ZPR] = getSVEStackFrameSizes(); - auto [PPRRange, ZPRRange] = partitionSVECS( - MBB, - SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord - ? MBB.getFirstTerminator() - : FirstGPRRestoreI, - PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true); - StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; StackOffset SVEStackSize = SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize; @@ -1467,16 +1489,6 @@ void AArch64EpilogueEmitter::emitEpilogue() { NeedsWinCFI, &HasWinCFI); } - // Deallocate callee-save non-SVE registers. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - - // Deallocate fixed objects. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(FixedObject), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - // Deallocate callee-save SVE registers. emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false, @@ -1619,7 +1631,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI, - StackOffset::getFixed(CombineAfterCSRBump ? 
PrologueSaveSize : 0)); + StackOffset::getFixed(AfterCSRPopSize - ArgumentStackToRestore)); } } diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 2c3870c..636d4f8a 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -8217,6 +8217,8 @@ bool AArch64AsmParser::parseDataExpr(const MCExpr *&Res) { Spec = AArch64::S_GOTPCREL; else if (Identifier == "plt") Spec = AArch64::S_PLT; + else if (Identifier == "funcinit") + Spec = AArch64::S_FUNCINIT; } if (Spec == AArch64::S_None) return Error(Loc, "invalid relocation specifier"); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index a388216..892b8da 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -232,6 +232,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(const MCFixup &Fixup, } if (RefKind == AArch64::S_AUTH || RefKind == AArch64::S_AUTHADDR) return ELF::R_AARCH64_AUTH_ABS64; + if (RefKind == AArch64::S_FUNCINIT) + return ELF::R_AARCH64_FUNCINIT64; return ELF::R_AARCH64_ABS64; } case AArch64::fixup_aarch64_add_imm12: diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 2b5cf34..bc090c6 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -40,6 +40,7 @@ const MCAsmInfo::AtSpecifier ELFAtSpecifiers[] = { {AArch64::S_GOT, "GOT"}, {AArch64::S_GOTPCREL, "GOTPCREL"}, {AArch64::S_PLT, "PLT"}, + {AArch64::S_FUNCINIT, "FUNCINIT"}, }; const MCAsmInfo::AtSpecifier MachOAtSpecifiers[] = { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h index 0dfa61b..f2acff5 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -164,6 +164,7 @@ enum { // ELF relocation specifiers in data directives: S_PLT = 0x400, S_GOTPCREL, + S_FUNCINIT, // Mach-O @ relocation specifiers: S_MACHO_GOT, diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index dd6fa16..d71f728 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -130,6 +130,12 @@ SMECallAttrs::SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI) if (auto *CalledFunction = CB.getCalledFunction()) CalledFn = SMEAttrs(*CalledFunction, TLI); + // An `invoke` of an agnostic ZA function may not return normally (it may + // resume in an exception block). In this case, it acts like a private ZA + // callee and may require a ZA save to be set up before it is called. + if (isa<InvokeInst>(CB)) + CalledFn.set(SMEAttrs::ZA_State_Agnostic, /*Enable=*/false); + // FIXME: We probably should not allow SME attributes on direct calls but // clang duplicates streaming mode attributes at each callsite. 
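// (Hedged illustration of the invoke handling above, using a hypothetical
// callee name: for
//   invoke void @agnostic_za_fn() to label %cont unwind label %lpad
// the unwind edge bypasses the callee's normal return, so clearing the
// agnostic-ZA attribute makes the caller conservatively set up a ZA save,
// just as it would for a private-ZA callee.)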
assert((IsIndirect || diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index dbe74b1..5700468 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -2394,15 +2394,19 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const { else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) || TII->isTRANS(MI))) - Result = true; + Result = !MI.mayLoadOrStore(); else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) && - TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) - Result = true; + TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) { + // Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD_*_LDS). + // For our purposes, these shall not be classified as VALU as this results + // in unexpected behavior. + Result = !MI.mayLoadOrStore(); + } else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) && TII->isSALU(MI)) - Result = true; + Result = !MI.mayLoadOrStore(); else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) && TII->isMFMAorWMMA(MI)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index a44af5f..1b559a6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2833,8 +2833,8 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, R = getMad(DAG, DL, VT, YH, CH, Mad1); } - const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && - (Flags.hasNoInfs() || Options.NoInfsFPMath); + const bool IsFiniteOnly = + (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs(); // TODO: Check if known finite from source value. if (!IsFiniteOnly) { @@ -3161,9 +3161,8 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT); R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R); - const auto &Options = getTargetMachine().Options; - if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) { + if (!Flags.hasNoInfs()) { SDValue OverflowCheckConst = DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT); SDValue Overflow = diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index ee466ca..596a895 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3575,7 +3575,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, const bool IsFiniteOnly = (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) && - (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath); + MI.getFlag(MachineInstr::FmNoInfs); if (!IsFiniteOnly) { // Expand isfinite(x) => fabs(x) < inf @@ -3864,9 +3864,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, R = B.buildSelect(Ty, Underflow, Zero, R); - const auto &Options = MF.getTarget().Options; - - if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) { + if (!(Flags & MachineInstr::FmNoInfs)) { auto OverflowCheckConst = B.buildFConstant(Ty, IsExp10 ? 
0x1.344136p+5f : 0x1.62e430p+6f); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 71494be..4e11c4f 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -14,6 +14,7 @@ #include "GCNRegPressure.h" #include "AMDGPU.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/RegisterPressure.h" using namespace llvm; @@ -459,10 +460,14 @@ LaneBitmask llvm::getLiveLaneMask(const LiveInterval &LI, SlotIndex SI, GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, - const MachineRegisterInfo &MRI) { + const MachineRegisterInfo &MRI, + GCNRegPressure::RegKind RegKind) { GCNRPTracker::LiveRegSet LiveRegs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { auto Reg = Register::index2VirtReg(I); + if (RegKind != GCNRegPressure::TOTAL_KINDS && + GCNRegPressure::getRegKind(Reg, MRI) != RegKind) + continue; if (!LIS.hasInterval(Reg)) continue; auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI); @@ -986,3 +991,128 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { #undef PFX } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void llvm::dumpMaxRegPressure(MachineFunction &MF, + GCNRegPressure::RegKind Kind, + LiveIntervals &LIS, + const MachineLoopInfo *MLI) { + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + auto &OS = dbgs(); + const char *RegName = GCNRegPressure::getName(Kind); + + unsigned MaxNumRegs = 0; + const MachineInstr *MaxPressureMI = nullptr; + GCNUpwardRPTracker RPT(LIS); + for (const MachineBasicBlock &MBB : MF) { + RPT.reset(MRI, LIS.getSlotIndexes()->getMBBEndIdx(&MBB).getPrevSlot()); + for (const MachineInstr &MI : reverse(MBB)) { + RPT.recede(MI); + unsigned NumRegs = RPT.getMaxPressure().getNumRegs(Kind); + if (NumRegs > MaxNumRegs) { + MaxNumRegs = NumRegs; + MaxPressureMI = &MI; + } + } + } + + SlotIndex MISlot = LIS.getInstructionIndex(*MaxPressureMI); + + // Max pressure can occur at either the early-clobber or register slot. + // Choose the maximum liveset between both slots. This is ugly but this is + // diagnostic code. + SlotIndex ECSlot = MISlot.getRegSlot(true); + SlotIndex RSlot = MISlot.getRegSlot(false); + GCNRPTracker::LiveRegSet ECLiveSet = getLiveRegs(ECSlot, LIS, MRI, Kind); + GCNRPTracker::LiveRegSet RLiveSet = getLiveRegs(RSlot, LIS, MRI, Kind); + unsigned ECNumRegs = getRegPressure(MRI, ECLiveSet).getNumRegs(Kind); + unsigned RNumRegs = getRegPressure(MRI, RLiveSet).getNumRegs(Kind); + GCNRPTracker::LiveRegSet *LiveSet = + ECNumRegs > RNumRegs ? &ECLiveSet : &RLiveSet; + SlotIndex MaxPressureSlot = ECNumRegs > RNumRegs ? ECSlot : RSlot; + assert(getRegPressure(MRI, *LiveSet).getNumRegs(Kind) == MaxNumRegs); + + // Split live registers into single-def and multi-def sets. 
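// (Classification note for the loop below: a register counts as single-def
// when its live interval, or every subrange when subranges exist, carries
// exactly one value number; such registers are usually the easiest candidates
// to sink or rematerialize when chasing the pressure peak.)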
+ GCNRegPressure SDefPressure, MDefPressure; + SmallVector<Register, 16> SDefRegs, MDefRegs; + for (auto [Reg, LaneMask] : *LiveSet) { + assert(GCNRegPressure::getRegKind(Reg, MRI) == Kind); + LiveInterval &LI = LIS.getInterval(Reg); + if (LI.getNumValNums() == 1 || + (LI.hasSubRanges() && + llvm::all_of(LI.subranges(), [](const LiveInterval::SubRange &SR) { + return SR.getNumValNums() == 1; + }))) { + SDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI); + SDefRegs.push_back(Reg); + } else { + MDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI); + MDefRegs.push_back(Reg); + } + } + unsigned SDefNumRegs = SDefPressure.getNumRegs(Kind); + unsigned MDefNumRegs = MDefPressure.getNumRegs(Kind); + assert(SDefNumRegs + MDefNumRegs == MaxNumRegs); + + auto printLoc = [&](const MachineBasicBlock *MBB, SlotIndex SI) { + return Printable([&, MBB, SI](raw_ostream &OS) { + OS << SI << ':' << printMBBReference(*MBB); + if (MLI) + if (const MachineLoop *ML = MLI->getLoopFor(MBB)) + OS << " (LoopHdr " << printMBBReference(*ML->getHeader()) + << ", Depth " << ML->getLoopDepth() << ")"; + }); + }; + + auto PrintRegInfo = [&](Register Reg, LaneBitmask LiveMask) { + GCNRegPressure RegPressure; + RegPressure.inc(Reg, LaneBitmask::getNone(), LiveMask, MRI); + OS << " " << printReg(Reg, TRI) << ':' + << TRI->getRegClassName(MRI.getRegClass(Reg)) << ", LiveMask " + << PrintLaneMask(LiveMask) << " (" << RegPressure.getNumRegs(Kind) << ' ' + << RegName << "s)\n"; + + // Use std::map to sort def/uses by SlotIndex. + std::map<SlotIndex, const MachineInstr *> Instrs; + for (const MachineInstr &MI : MRI.reg_nodbg_instructions(Reg)) { + Instrs[LIS.getInstructionIndex(MI).getRegSlot()] = &MI; + } + + for (const auto &[SI, MI] : Instrs) { + OS << " "; + if (MI->definesRegister(Reg, TRI)) + OS << "def "; + if (MI->readsRegister(Reg, TRI)) + OS << "use "; + OS << printLoc(MI->getParent(), SI) << ": " << *MI; + } + }; + + OS << "\n*** Register pressure info (" << RegName << "s) for " << MF.getName() + << " ***\n"; + OS << "Max pressure is " << MaxNumRegs << ' ' << RegName << "s at " + << printLoc(MaxPressureMI->getParent(), MaxPressureSlot) << ": " + << *MaxPressureMI; + + OS << "\nLive registers with single definition (" << SDefNumRegs << ' ' + << RegName << "s):\n"; + + // Sort SDefRegs by number of uses (smallest first) + llvm::sort(SDefRegs, [&](Register A, Register B) { + return std::distance(MRI.use_nodbg_begin(A), MRI.use_nodbg_end()) < + std::distance(MRI.use_nodbg_begin(B), MRI.use_nodbg_end()); + }); + + for (const Register Reg : SDefRegs) { + PrintRegInfo(Reg, LiveSet->lookup(Reg)); + } + + OS << "\nLive registers with multiple definitions (" << MDefNumRegs << ' ' + << RegName << "s):\n"; + for (const Register Reg : MDefRegs) { + PrintRegInfo(Reg, LiveSet->lookup(Reg)); + } +} +#endif diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 898d1ff..979a8b0 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -31,6 +31,12 @@ class SlotIndex; struct GCNRegPressure { enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS }; + static constexpr const char *getName(RegKind Kind) { + const char *Names[] = {"SGPR", "VGPR", "AGPR", "AVGPR"}; + assert(Kind < TOTAL_KINDS); + return Names[Kind]; + } + GCNRegPressure() { clear(); } @@ -41,6 +47,11 @@ struct GCNRegPressure { void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); } + unsigned getNumRegs(RegKind Kind) const { + assert(Kind < TOTAL_KINDS); + return 
Value[Kind]; + } + /// \returns the SGPR32 pressure unsigned getSGPRNum() const { return Value[SGPR]; } /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure @@ -138,6 +149,12 @@ struct GCNRegPressure { void dump() const; + static RegKind getRegKind(unsigned Reg, const MachineRegisterInfo &MRI) { + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + const SIRegisterInfo *STI = static_cast<const SIRegisterInfo *>(TRI); + return (RegKind)getRegKind(MRI.getRegClass(Reg), STI); + } + private: static constexpr unsigned ValueArraySize = TOTAL_KINDS * 2; @@ -294,8 +311,10 @@ public: } }; -GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, - const MachineRegisterInfo &MRI); +GCNRPTracker::LiveRegSet +getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, + const MachineRegisterInfo &MRI, + GCNRegPressure::RegKind RegKind = GCNRegPressure::TOTAL_KINDS); //////////////////////////////////////////////////////////////////////////////// // GCNUpwardRPTracker @@ -428,9 +447,6 @@ LaneBitmask getLiveLaneMask(const LiveInterval &LI, SlotIndex SI, const MachineRegisterInfo &MRI, LaneBitmask LaneMaskFilter = LaneBitmask::getAll()); -GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, - const MachineRegisterInfo &MRI); - /// creates a map MachineInstr -> LiveRegSet /// R - range of iterators on instructions /// After - upon entry or exit of every instruction @@ -524,6 +540,11 @@ public: } }; +LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF, + GCNRegPressure::RegKind Kind, + LiveIntervals &LIS, + const MachineLoopInfo *MLI); + } // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index bdc0810..58482ea 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -69,6 +69,21 @@ static cl::opt<bool> GCNTrackers( cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), cl::init(false)); +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +#define DUMP_MAX_REG_PRESSURE +static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler( + "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden, + cl::desc("Print a list of live registers along with their def/uses at the " + "point of maximum register pressure before scheduling."), + cl::init(false)); + +static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler( + "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden, + cl::desc("Print a list of live registers along with their def/uses at the " + "point of maximum register pressure after scheduling."), + cl::init(false)); +#endif + const unsigned ScheduleMetrics::ScaleFactor = 100; GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) @@ -960,6 +975,14 @@ void GCNScheduleDAGMILive::runSchedStages() { RegionLiveOuts.buildLiveRegMap(); } +#ifdef DUMP_MAX_REG_PRESSURE + if (PrintMaxRPRegUsageBeforeScheduler) { + dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI); + dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI); + LIS->dump(); + } +#endif + GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl); while (S.advanceStage()) { auto Stage = createSchedStage(S.getCurrentStage()); @@ -995,6 +1018,14 @@ void GCNScheduleDAGMILive::runSchedStages() { Stage->finalizeGCNSchedStage(); } + +#ifdef DUMP_MAX_REG_PRESSURE + if (PrintMaxRPRegUsageAfterScheduler) { + dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI); + 
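// (Usage note: the report is emitted once per register kind; a hypothetical
// extra call such as dumpMaxRegPressure(MF, GCNRegPressure::AGPR, *LIS, MLI)
// would print AGPR peaks in the same format. Not part of this patch.)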
dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI); + LIS->dump(); + } +#endif } #ifndef NDEBUG diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 64e34db..5f6d742 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -260,8 +260,12 @@ class NSAHelper { } class MIMGNSAHelper<int num_addrs, - list<RegisterClass> addr_types=!listsplat(VGPR_32, num_addrs)> - : NSAHelper<> { + list<RegisterOperand> addr_types_in=[]> + : NSAHelper<> { + list<RegisterOperand> addr_types = + !if(!empty(addr_types_in), !listsplat(VGPROp_32, num_addrs), + addr_types_in); + list<string> AddrAsmNames = !foreach(i, !range(num_addrs), "vaddr" # i); let AddrIns = !dag(ins, addr_types, AddrAsmNames); let AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]"; @@ -358,7 +362,7 @@ class MIMG_gfx11<int op, dag outs, string dns = ""> // Base class for all NSA MIMG instructions. // Note that 1-dword addresses always use non-NSA variants. class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="", - list<RegisterClass> addr_types=[], + list<RegisterOperand> addr_types=[], RegisterOperand LastAddrRC = VGPROp_32> : MIMG<outs, dns>, MIMGe_gfx11<op> { let SubtargetPredicate = isGFX11Only; @@ -378,7 +382,7 @@ class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="", } class VIMAGE_gfx12<int op, dag outs, int num_addrs, string dns="", - list<RegisterClass> addr_types=[]> + list<RegisterOperand> addr_types=[]> : VIMAGE<outs, dns>, VIMAGEe<op> { let SubtargetPredicate = isGFX12Plus; let AssemblerPredicate = isGFX12Plus; @@ -1521,12 +1525,12 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16, bit isDual, bit isBVH8> { int VAddrDwords = !srl(Size, 5); int GFX11PlusNSAAddrs = !if(IsA16, 4, 5); - RegisterClass node_ptr_type = !if(Is64, VReg_64, VGPR_32); - list<RegisterClass> GFX11PlusAddrTypes = - !cond(isBVH8 : [node_ptr_type, VReg_64, VReg_96, VReg_96, VGPR_32], - isDual : [node_ptr_type, VReg_64, VReg_96, VReg_96, VReg_64], - IsA16 : [node_ptr_type, VGPR_32, VReg_96, VReg_96], - true : [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]); + RegisterOperand node_ptr_type = !if(Is64, VGPROp_64, VGPROp_32); + list<RegisterOperand> GFX11PlusAddrTypes = + !cond(isBVH8 : [node_ptr_type, VGPROp_64, VGPROp_96, VGPROp_96, VGPROp_32], + isDual : [node_ptr_type, VGPROp_64, VGPROp_96, VGPROp_96, VGPROp_64], + IsA16 : [node_ptr_type, VGPROp_32, VGPROp_96, VGPROp_96], + true : [node_ptr_type, VGPROp_32, VGPROp_96, VGPROp_96, VGPROp_96]); } class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterOperand AddrRC> @@ -1552,7 +1556,7 @@ class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterOperand AddrRC> } class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs, - list<RegisterClass> addr_types> + list<RegisterOperand> addr_types> : MIMG_nsa_gfx11<op.GFX11, (outs VReg_128:$vdata), num_addrs, "GFX11", addr_types> { let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$srsrc, A16:$a16)); @@ -1561,7 +1565,7 @@ class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs, class VIMAGE_IntersectRay_gfx12<mimgopc op, string opcode, int num_addrs, bit isDual, bit isBVH8, - list<RegisterClass> addr_types> + list<RegisterOperand> addr_types> : VIMAGE_gfx12<op.GFX12, !if(!or(isDual, isBVH8), (outs VReg_320:$vdata, VReg_96:$ray_origin_out, VReg_96:$ray_dir_out), diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 
80e985d..a2841c11 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -18168,7 +18168,7 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { return CacheLineAlign; } -LLVM_ATTRIBUTE_UNUSED +[[maybe_unused]] static bool isCopyFromRegOfInlineAsm(const SDNode *N) { assert(N->getOpcode() == ISD::CopyFromReg); do { diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 5e27b37..6dcbced 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1019,7 +1019,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // SMEM and VMEM operations. So there will never be // outstanding address translations for both SMEM and // VMEM at the same time. - setScoreLB(T, CurrScore - 1); + setScoreLB(T, getScoreUB(T) - 1); PendingEvents &= ~(1 << OtherEvent); } for (const MachineOperand &Op : Inst.all_uses()) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index ec5c5bb3..d516330 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -865,22 +865,16 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - if (DestReg == AMDGPU::VCC_LO) { - if (AMDGPU::SReg_32RegClass.contains(SrcReg)) { - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO) - .addReg(SrcReg, getKillRegState(KillSrc)); - } else { + if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { + if (DestReg == AMDGPU::VCC_LO) { // FIXME: Hack until VReg_1 removed. assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) - .addImm(0) - .addReg(SrcReg, getKillRegState(KillSrc)); + .addImm(0) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; } - return; - } - - if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); return; } @@ -898,22 +892,16 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - if (DestReg == AMDGPU::VCC) { - if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) - .addReg(SrcReg, getKillRegState(KillSrc)); - } else { + if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) { + if (DestReg == AMDGPU::VCC) { // FIXME: Hack until VReg_1 removed. assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) - .addImm(0) - .addReg(SrcReg, getKillRegState(KillSrc)); + .addImm(0) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; } - return; - } - - if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) { reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); return; } diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index eac9fd4..27e5ee9c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3726,6 +3726,23 @@ def : GCNPat < } // End foreach Ty = ... 
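// (Note on the True16 patterns added below: they match an i32 assembled as
// or(zext($src_lo), bitconvert(build_vector(0, $src_hi))), in either operand
// order, and rewrite it into a single REG_SEQUENCE over the 16-bit halves
// rather than a shift-and-or sequence.)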
} // End AddedComplexity = 1 +let True16Predicate = UseRealTrue16Insts in { +def : GCNPat< + (i32 (DivergentBinFrag<or> + (i32 (zext i16:$src_lo)), + (i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_16:$src_hi))))) + )), + (REG_SEQUENCE VGPR_32, $src_lo, lo16, $src_hi, hi16) +>; +def : GCNPat< + (i32 (DivergentBinFrag<or> + (i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_16:$src_hi))))), + (i32 (zext i16:$src_lo)) + )), + (REG_SEQUENCE VGPR_32, $src_lo, lo16, $src_hi, hi16) +>; +} + let True16Predicate = UseRealTrue16Insts in def : GCNPat < (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 (trunc i32:$src1)))), diff --git a/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp b/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp index 39e651d..8945ec3 100644 --- a/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp +++ b/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp @@ -166,7 +166,7 @@ static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, uint64_t RegNo, } // TODO -LLVM_ATTRIBUTE_UNUSED +[[maybe_unused]] static DecodeStatus DecodesFPR128RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const MCDisassembler *Decoder) { diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index ca81d30..8ace2d2 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -28,6 +28,7 @@ #include "llvm/Support/MD5.h" #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include <cstdint> #include <optional> using namespace llvm; @@ -193,7 +194,12 @@ void DXContainerGlobals::addResourcesForPSV(Module &M, PSVRuntimeInfo &PSV) { dxbc::PSV::v2::ResourceBindInfo BindInfo; BindInfo.Type = Type; BindInfo.LowerBound = Binding.LowerBound; - BindInfo.UpperBound = Binding.LowerBound + Binding.Size - 1; + assert((Binding.Size == UINT32_MAX || + (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX) && + "Resource range is too large"); + BindInfo.UpperBound = (Binding.Size == UINT32_MAX) + ? UINT32_MAX + : Binding.LowerBound + Binding.Size - 1; BindInfo.Space = Binding.Space; BindInfo.Kind = static_cast<dxbc::PSV::ResourceKind>(Kind); BindInfo.Flags = Flags; diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 974f653..5f180d6 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -667,11 +667,10 @@ static DecodeStatus DecodeHvxWRRegisterClass(MCInst &Inst, unsigned RegNo, return DecodeRegisterClass(Inst, RegNo, HvxWRDecoderTable); } -LLVM_ATTRIBUTE_UNUSED // Suppress warning temporarily. - static DecodeStatus - DecodeHvxVQRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const MCDisassembler *Decoder) { +[[maybe_unused]] // Suppress warning temporarily.
+static DecodeStatus DecodeHvxVQRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const MCDisassembler *Decoder) { static const MCPhysReg HvxVQRDecoderTable[] = { Hexagon::VQ0, Hexagon::VQ1, Hexagon::VQ2, Hexagon::VQ3, Hexagon::VQ4, Hexagon::VQ5, Hexagon::VQ6, Hexagon::VQ7}; diff --git a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp index 5dde47a..a3296e0 100644 --- a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp +++ b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp @@ -419,8 +419,8 @@ namespace { using HCE = HexagonConstExtenders; - LLVM_ATTRIBUTE_UNUSED - raw_ostream &operator<< (raw_ostream &OS, const OffsetRange &OR) { + [[maybe_unused]] + raw_ostream &operator<<(raw_ostream &OS, const OffsetRange &OR) { if (OR.Min > OR.Max) OS << '!'; OS << '[' << OR.Min << ',' << OR.Max << "]a" << unsigned(OR.Align) @@ -435,8 +435,8 @@ namespace { const HexagonRegisterInfo &HRI; }; - LLVM_ATTRIBUTE_UNUSED - raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &P) { + [[maybe_unused]] + raw_ostream &operator<<(raw_ostream &OS, const PrintRegister &P) { if (P.Rs.Reg != 0) OS << printReg(P.Rs.Reg, &P.HRI, P.Rs.Sub); else @@ -451,8 +451,8 @@ namespace { const HexagonRegisterInfo &HRI; }; - LLVM_ATTRIBUTE_UNUSED - raw_ostream &operator<< (raw_ostream &OS, const PrintExpr &P) { + [[maybe_unused]] + raw_ostream &operator<<(raw_ostream &OS, const PrintExpr &P) { OS << "## " << (P.Ex.Neg ? "- " : "+ "); if (P.Ex.Rs.Reg != 0) OS << printReg(P.Ex.Rs.Reg, &P.HRI, P.Ex.Rs.Sub); @@ -469,15 +469,15 @@ namespace { const HexagonRegisterInfo &HRI; }; - LLVM_ATTRIBUTE_UNUSED - raw_ostream &operator<< (raw_ostream &OS, const PrintInit &P) { + [[maybe_unused]] + raw_ostream &operator<<(raw_ostream &OS, const PrintInit &P) { OS << '[' << P.ExtI.first << ", " << PrintExpr(P.ExtI.second, P.HRI) << ']'; return OS; } - LLVM_ATTRIBUTE_UNUSED - raw_ostream &operator<< (raw_ostream &OS, const HCE::ExtDesc &ED) { + [[maybe_unused]] + raw_ostream &operator<<(raw_ostream &OS, const HCE::ExtDesc &ED) { assert(ED.OpNum != -1u); const MachineBasicBlock &MBB = *ED.getOp().getParent()->getParent(); const MachineFunction &MF = *MBB.getParent(); @@ -493,8 +493,8 @@ namespace { return OS; } - LLVM_ATTRIBUTE_UNUSED - raw_ostream &operator<< (raw_ostream &OS, const HCE::ExtRoot &ER) { + [[maybe_unused]] + raw_ostream &operator<<(raw_ostream &OS, const HCE::ExtRoot &ER) { switch (ER.Kind) { case MachineOperand::MO_Immediate: OS << "imm:" << ER.V.ImmVal; @@ -527,8 +527,8 @@ namespace { return OS; } - LLVM_ATTRIBUTE_UNUSED - raw_ostream &operator<< (raw_ostream &OS, const HCE::ExtValue &EV) { + [[maybe_unused]] + raw_ostream &operator<<(raw_ostream &OS, const HCE::ExtValue &EV) { OS << HCE::ExtRoot(EV) << " off:" << EV.Offset; return OS; } @@ -540,8 +540,8 @@ namespace { const HexagonRegisterInfo &HRI; }; - LLVM_ATTRIBUTE_UNUSED - raw_ostream &operator<< (raw_ostream &OS, const PrintIMap &P) { + [[maybe_unused]] + raw_ostream &operator<<(raw_ostream &OS, const PrintIMap &P) { OS << "{\n"; for (const std::pair<const HCE::ExtenderInit, HCE::IndexList> &Q : P.IMap) { OS << " " << PrintInit(Q.first, P.HRI) << " -> {"; diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index 4d96cfa..c7a4f68 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -789,7 +789,7 @@ struct ShuffleMask { } }; -LLVM_ATTRIBUTE_UNUSED 
+[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, const ShuffleMask &SM) { SM.print(OS); return OS; diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index 87d052b..e4c0a16 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -364,7 +364,7 @@ private: const HexagonVectorCombine &HVC; }; -LLVM_ATTRIBUTE_UNUSED +[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::AddrInfo &AI) { OS << "Inst: " << AI.Inst << " " << *AI.Inst << '\n'; OS << "Addr: " << *AI.Addr << '\n'; @@ -375,7 +375,7 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::AddrInfo &AI) { return OS; } -LLVM_ATTRIBUTE_UNUSED +[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) { OS << "IsLoad:" << (MG.IsLoad ? "yes" : "no"); OS << ", IsHvx:" << (MG.IsHvx ? "yes" : "no") << '\n'; @@ -394,7 +394,7 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) { return OS; } -LLVM_ATTRIBUTE_UNUSED +[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan::Block &B) { OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] "; @@ -408,7 +408,7 @@ raw_ostream &operator<<(raw_ostream &OS, return OS; } -LLVM_ATTRIBUTE_UNUSED +[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) { OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n'; for (const AlignVectors::ByteSpan::Block &B : BS) diff --git a/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp b/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp index fa8ae60..2ff5843 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp @@ -111,7 +111,7 @@ namespace { friend raw_ostream &operator<< (raw_ostream &OS, const DepChain &D); }; - LLVM_ATTRIBUTE_UNUSED + [[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, const DepChain &D) { const ChainOfDependences &CD = D.Chain; int ChainSize = CD.size(); @@ -144,7 +144,7 @@ namespace { bool isDefined() { return Inst2Replace != nullptr; } }; - LLVM_ATTRIBUTE_UNUSED + [[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, const ReuseValue &RU) { OS << "** ReuseValue ***\n"; OS << "Instruction to Replace: " << *(RU.Inst2Replace) << "\n"; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h index ca98269..e3094b4 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h @@ -275,7 +275,7 @@ namespace HexagonII { INST_ICLASS_ALU32_3 = 0xf0000000 }; - LLVM_ATTRIBUTE_UNUSED + [[maybe_unused]] static unsigned getMemAccessSizeInBytes(MemAccessSize S) { switch (S) { case ByteAccess: return 1; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index bef4868..7e7ee75 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -280,6 +280,10 @@ static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) { } void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) { + if (!Subtarget->hasTcgen05InstSupport()) + report_fatal_error( + "tcgen05.ld is not supported on this architecture variant"); + SDLoc DL(N); unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); @@ 
-2136,6 +2140,10 @@ static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) { } void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) { + if (!Subtarget->hasTcgen05InstSupport()) + report_fatal_error( + "tcgen05.st is not supported on this architecture variant"); + SDLoc DL(N); unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 6c14cf0..dfde0cc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -101,6 +101,22 @@ def PrmtMode : Operand<i32> { // NVPTX Instruction Predicate Definitions //===----------------------------------------------------------------------===// +// Checks PTX version and family-specific and architecture-specific SM versions. +// For example, sm_100{f/a} and any future variants in the same family will match +// for any PTX version greater than or equal to `PTXVersion`. +class PTXWithFamilySMs<int PTXVersion, list<int> SMVersions> : + Predicate<"Subtarget->hasPTXWithFamilySMs(" # PTXVersion # ", {" # + !interleave(SMVersions, ", ") # "})">; + +// Checks PTX version and architecture-specific SM versions. +// For example, sm_100{a} will match for any PTX version +// greater than or equal to `PTXVersion`. +class PTXWithAccelSMs<int PTXVersion, list<int> SMVersions> : + Predicate<"Subtarget->hasPTXWithAccelSMs(" # PTXVersion # ", {" # + !interleave(SMVersions, ", ") # "})">; + +// Helper predicate to call a subtarget method. +class callSubtarget<string SubtargetMethod> : Predicate<"Subtarget->" # SubtargetMethod # "()">; def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">; def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index a8b854f..22cf3a7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -5103,8 +5103,8 @@ let Predicates = [hasSM<90>, hasPTX<78>] in { def EXIT : NullaryInst<"exit", int_nvvm_exit>; // Tcgen05 intrinsics -let isConvergent = true, Predicates = [hasTcgen05Instructions] in { - +let isConvergent = true in { +let Predicates = [callSubtarget<"hasTcgen05InstSupport">] in { multiclass TCGEN05_ALLOC_INTR<string AS, string num, Intrinsic Intr> { def "" : BasicNVPTXInst<(outs), (ins ADDR:$dst, B32:$ncols), @@ -5156,15 +5156,6 @@ defm TCGEN05_COMMIT_CG2 : TCGEN05_COMMIT_INTR<"", "2">; defm TCGEN05_COMMIT_S64_CG1 : TCGEN05_COMMIT_INTR<"shared", "1">; defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR<"shared", "2">; -multiclass TCGEN05_SHIFT_INTR<string num, Intrinsic Intr> { - def "" : BasicNVPTXInst<(outs), - (ins ADDR:$tmem_addr), - "tcgen05.shift.cta_group::" # num # ".down", - [(Intr addr:$tmem_addr)]>; -} -defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>; -defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>; - multiclass TCGEN05_CP_INTR<string shape, string src_fmt, string mc = ""> { defvar dst_fmt = !if(!eq(src_fmt, ""), "", ".b8x16"); defvar fmt_asm = StrJoin<".", [dst_fmt, src_fmt]>.ret; @@ -5195,9 +5186,22 @@ foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in { defm TCGEN05_CP_64x128_2 # src_fmt : TCGEN05_CP_INTR<"64x128b", src_fmt, "warpx2::01_23">; defm TCGEN05_CP_32x128 # src_fmt : TCGEN05_CP_INTR<"32x128b", src_fmt, "warpx4">; } +} // Predicates + +let Predicates = [callSubtarget<"hasTcgen05ShiftSupport">] in { +multiclass 
TCGEN05_SHIFT_INTR<string num, Intrinsic Intr> { + def "" : BasicNVPTXInst<(outs), + (ins ADDR:$tmem_addr), + "tcgen05.shift.cta_group::" # num # ".down", + [(Intr addr:$tmem_addr)]>; +} +defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>; +defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>; +} // Predicates + } // isConvergent -let hasSideEffects = 1, Predicates = [hasTcgen05Instructions] in { +let hasSideEffects = 1, Predicates = [callSubtarget<"hasTcgen05InstSupport">] in { def tcgen05_fence_before_thread_sync: NullaryInst< "tcgen05.fence::before_thread_sync", int_nvvm_tcgen05_fence_before_thread_sync>; @@ -5231,8 +5235,7 @@ class TCGEN05_LDST_REGINFO<int Veclen> { // class TCGEN05_LD_INST<string Shape, int Num, bit Pack> : - NVPTXInst<(outs), (ins), "?", []>, - Requires<[hasTcgen05Instructions]> { + NVPTXInst<(outs), (ins), "?", []> { TCGEN05_LDST_REGINFO Info = TCGEN05_LDST_REGINFO< NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.veclen>; @@ -5256,8 +5259,7 @@ class TCGEN05_LD_INST<string Shape, int Num, bit Pack> : // class TCGEN05_ST_INST<string Shape, int Num, bit Unpack> : - NVPTXInst<(outs), (ins), "?", []>, - Requires<[hasTcgen05Instructions]> { + NVPTXInst<(outs), (ins), "?", []> { TCGEN05_LDST_REGINFO Info = TCGEN05_LDST_REGINFO< NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.veclen>; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index c548967..989be50 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -72,6 +72,40 @@ const SelectionDAGTargetInfo *NVPTXSubtarget::getSelectionDAGInfo() const { return TSInfo.get(); } +bool NVPTXSubtarget::hasPTXWithFamilySMs(unsigned PTXVersion, + ArrayRef<unsigned> SMVersions) const { + unsigned PTXVer = getPTXVersion(); + if (!hasFamilySpecificFeatures() || PTXVer < PTXVersion) + return false; + + unsigned SMVer = getSmVersion(); + return llvm::any_of(SMVersions, [&](unsigned SM) { + // sm_101 is a different family, never group it with sm_10x. + if (SMVer == 101 || SM == 101) + return SMVer == SM && + // PTX 9.0 and later renamed sm_101 to sm_110, so sm_101 is not + // supported. + !(PTXVer >= 90 && SMVer == 101); + + return getSmFamilyVersion() == SM / 10 && SMVer >= SM; + }); +} + +bool NVPTXSubtarget::hasPTXWithAccelSMs(unsigned PTXVersion, + ArrayRef<unsigned> SMVersions) const { + unsigned PTXVer = getPTXVersion(); + if (!hasArchAccelFeatures() || PTXVer < PTXVersion) + return false; + + unsigned SMVer = getSmVersion(); + return llvm::any_of(SMVersions, [&](unsigned SM) { + return SMVer == SM && + // PTX 9.0 and later renamed sm_101 to sm_110, so sm_101 is not + // supported. + !(PTXVer >= 90 && SMVer == 101); + }); +} + bool NVPTXSubtarget::allowFP16Math() const { return hasFP16Math() && NoF16Math == false; } diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index e81c56b..194dbdc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -73,6 +73,18 @@ public: const SelectionDAGTargetInfo *getSelectionDAGInfo() const override; + // Checks PTX version and family-specific and architecture-specific SM + // versions. For example, sm_100{f/a} and any future variants in the same + // family will match for any PTX version greater than or equal to + // `PTXVersion`. 
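  // Illustrative example, with assumed SM values following the rule above: on
  // an sm_103f subtarget with PTX >= 8.8, hasPTXWithFamilySMs(88, {100, 101})
  // holds because sm_103 belongs to the sm_10x family and 103 >= 100, whereas
  // an sm_101f subtarget matches only through the explicit 101 entry.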
+ bool hasPTXWithFamilySMs(unsigned PTXVersion, + ArrayRef<unsigned> SMVersions) const; + // Checks PTX version and architecture-specific SM versions. + // For example, sm_100{a} will match for any PTX version greater than or equal + // to `PTXVersion`. + bool hasPTXWithAccelSMs(unsigned PTXVersion, + ArrayRef<unsigned> SMVersions) const; + bool has256BitVectorLoadStore(unsigned AS) const { return SmVersion >= 100 && PTXVersion >= 88 && AS == NVPTXAS::ADDRESS_SPACE_GLOBAL; @@ -127,6 +139,27 @@ public: return HasTcgen05 && PTXVersion >= MinPTXVersion; } + // Checks following instructions support: + // - tcgen05.ld/st + // - tcgen05.alloc/dealloc/relinquish + // - tcgen05.cp + // - tcgen05.fence/wait + // - tcgen05.commit + bool hasTcgen05InstSupport() const { + // sm_101 renamed to sm_110 in PTX 9.0 + return hasPTXWithFamilySMs(90, {100, 110}) || + hasPTXWithFamilySMs(88, {100, 101}) || + hasPTXWithAccelSMs(86, {100, 101}); + } + + // Checks tcgen05.shift instruction support. + bool hasTcgen05ShiftSupport() const { + // sm_101 renamed to sm_110 in PTX 9.0 + return hasPTXWithAccelSMs(90, {100, 110, 103}) || + hasPTXWithAccelSMs(88, {100, 101, 103}) || + hasPTXWithAccelSMs(86, {100, 101}); + } + bool hasTcgen05MMAScaleInputDImm() const { return FullSmVersion == 1003 && PTXVersion >= 86; } @@ -158,6 +191,7 @@ public: bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; } unsigned int getFullSmVersion() const { return FullSmVersion; } unsigned int getSmVersion() const { return getFullSmVersion() / 10; } + unsigned int getSmFamilyVersion() const { return getFullSmVersion() / 100; } // GPUs with "a" suffix have architecture-accelerated features that are // supported on the specified architecture only, hence such targets do not // follow the onion layer model. hasArchAccelFeatures() allows distinguishing diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 4b54231..8851a0f 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -1659,6 +1659,10 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return generateImmOutOfRangeError( Operands, ErrorInfo, -1, (1 << 5) - 1, "immediate must be non-zero in the range"); + case Match_InvalidXSfmmVType: { + SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); + return generateXSfmmVTypeError(ErrorLoc); + } case Match_InvalidVTypeI: { SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); return generateVTypeError(ErrorLoc); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 70b7c43..e75dfe3 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -142,6 +142,22 @@ enum { ReadsPastVLShift = DestEEWShift + 2, ReadsPastVLMask = 1ULL << ReadsPastVLShift, + + // 0 -> Don't care about altfmt bit in VTYPE. + // 1 -> Is not altfmt. + // 2 -> Is altfmt(BF16). + AltFmtTypeShift = ReadsPastVLShift + 1, + AltFmtTypeMask = 3ULL << AltFmtTypeShift, + + // XSfmmbase + HasTWidenOpShift = AltFmtTypeShift + 2, + HasTWidenOpMask = 1ULL << HasTWidenOpShift, + + HasTMOpShift = HasTWidenOpShift + 1, + HasTMOpMask = 1ULL << HasTMOpShift, + + HasTKOpShift = HasTMOpShift + 1, + HasTKOpMask = 1ULL << HasTKOpShift, }; // Helper functions to read TSFlags. 
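// (Bit-layout sketch of the fields added above, assuming the shifts compose
// as written: AltFmtType takes the two bits after ReadsPastVL, followed by the
// one-bit HasTWidenOp, HasTMOp, and HasTKOp flags. A field is decoded as e.g.
//   (TSFlags & AltFmtTypeMask) >> AltFmtTypeShift
// which is exactly what getAltFmtType below returns.)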
@@ -183,6 +199,11 @@ static inline bool hasRoundModeOp(uint64_t TSFlags) { return TSFlags & HasRoundModeOpMask; } +enum class AltFmtType { DontCare, NotAltFmt, AltFmt }; +static inline AltFmtType getAltFmtType(uint64_t TSFlags) { + return static_cast<AltFmtType>((TSFlags & AltFmtTypeMask) >> AltFmtTypeShift); +} + /// \returns true if this instruction uses vxrm static inline bool usesVXRM(uint64_t TSFlags) { return TSFlags & UsesVXRMMask; } @@ -204,11 +225,47 @@ static inline bool readsPastVL(uint64_t TSFlags) { return TSFlags & ReadsPastVLMask; } +// XSfmmbase +static inline bool hasTWidenOp(uint64_t TSFlags) { + return TSFlags & HasTWidenOpMask; +} + +static inline bool hasTMOp(uint64_t TSFlags) { return TSFlags & HasTMOpMask; } + +static inline bool hasTKOp(uint64_t TSFlags) { return TSFlags & HasTKOpMask; } + +static inline unsigned getTNOpNum(const MCInstrDesc &Desc) { + const uint64_t TSFlags = Desc.TSFlags; + assert(hasTWidenOp(TSFlags) && hasVLOp(TSFlags)); + unsigned Offset = 3; + if (hasTKOp(TSFlags)) + Offset = 4; + return Desc.getNumOperands() - Offset; +} + +static inline unsigned getTMOpNum(const MCInstrDesc &Desc) { + const uint64_t TSFlags = Desc.TSFlags; + assert(hasTWidenOp(TSFlags) && hasTMOp(TSFlags)); + if (hasTKOp(TSFlags)) + return Desc.getNumOperands() - 5; + // vtzero.t + return Desc.getNumOperands() - 4; +} + +static inline unsigned getTKOpNum(const MCInstrDesc &Desc) { + [[maybe_unused]] const uint64_t TSFlags = Desc.TSFlags; + assert(hasTWidenOp(TSFlags) && hasTKOp(TSFlags)); + return Desc.getNumOperands() - 3; +} + static inline unsigned getVLOpNum(const MCInstrDesc &Desc) { const uint64_t TSFlags = Desc.TSFlags; // This method is only called if we expect to have a VL operand, and all // instructions with VL also have SEW. assert(hasSEWOp(TSFlags) && hasVLOp(TSFlags)); + // In XSfmmbase, TN is an alias for VL, so here we use the same TSFlags bit. + if (hasTWidenOp(TSFlags)) + return getTNOpNum(Desc); unsigned Offset = 2; if (hasVecPolicyOp(TSFlags)) Offset = 3; @@ -226,7 +283,7 @@ static inline unsigned getSEWOpNum(const MCInstrDesc &Desc) { const uint64_t TSFlags = Desc.TSFlags; assert(hasSEWOp(TSFlags)); unsigned Offset = 1; - if (hasVecPolicyOp(TSFlags)) + if (hasVecPolicyOp(TSFlags) || hasTWidenOp(TSFlags)) Offset = 2; return Desc.getNumOperands() - Offset; } @@ -243,6 +300,9 @@ static inline int getFRMOpNum(const MCInstrDesc &Desc) { if (!hasRoundModeOp(TSFlags) || usesVXRM(TSFlags)) return -1; + if (hasTWidenOp(TSFlags) && hasTMOp(TSFlags)) + return getTMOpNum(Desc) - 1; + // The operand order // -------------------------------------- // | n-1 (if any) | n-2 | n-3 | n-4 | @@ -385,7 +445,9 @@ enum OperandType : unsigned { OPERAND_SEW_MASK, // Vector rounding mode for VXRM or FRM. OPERAND_VEC_RM, - OPERAND_LAST_RISCV_IMM = OPERAND_VEC_RM, + // Vtype operand for XSfmm extension. + OPERAND_XSFMM_VTYPE, + OPERAND_LAST_RISCV_IMM = OPERAND_XSFMM_VTYPE, // Operand is either a register or uimm5; this is used by V extension pseudo // instructions to represent a value that can be passed as AVL to either vsetvli // or vsetivli.
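A sketch of the trailing operand layout that the getT*OpNum helpers above assume for XSfmm pseudos, reconstructed from their offsets (operand names are illustrative, not from the patch):
//   with a TK operand:     ... | tm  | tn  | tk  | log2sew | twiden
//                                N-5   N-4   N-3     N-2       N-1
//   without TK (vtzero.t): ... | tm  | tn  | log2sew | twiden
//                                N-4   N-3     N-2       N-1
// Hence getTKOpNum is N-3, getTNOpNum is N-4 (N-3 without TK), getTMOpNum is
// N-5 (N-4 without TK), and with twiden present getSEWOpNum is N-2 while TN
// doubles as the VL operand; an FRM operand, when present, sits immediately
// before tm (getTMOpNum - 1).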
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 66ca436..de433e4 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -1100,6 +1100,12 @@ static bool lowerRISCVVMachineInstrToMCInst(const MachineInstr *MI, --NumOps; if (RISCVII::hasRoundModeOp(TSFlags)) --NumOps; + if (RISCVII::hasTWidenOp(TSFlags)) + --NumOps; + if (RISCVII::hasTMOp(TSFlags)) + --NumOps; + if (RISCVII::hasTKOp(TSFlags)) + --NumOps; bool hasVLOutput = RISCVInstrInfo::isFaultOnlyFirstLoad(*MI); for (unsigned OpNo = 0; OpNo != NumOps; ++OpNo) { diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 437022f..9a6afa1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -516,6 +516,44 @@ void RISCVDAGToDAGISel::selectVSETVLI(SDNode *Node) { CurDAG->getMachineNode(Opcode, DL, XLenVT, VLOperand, VTypeIOp)); } +void RISCVDAGToDAGISel::selectXSfmmVSET(SDNode *Node) { + if (!Subtarget->hasVendorXSfmmbase()) + return; + + assert(Node->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Unexpected opcode"); + + SDLoc DL(Node); + MVT XLenVT = Subtarget->getXLenVT(); + + unsigned IntNo = Node->getConstantOperandVal(0); + + assert((IntNo == Intrinsic::riscv_sf_vsettnt || + IntNo == Intrinsic::riscv_sf_vsettm || + IntNo == Intrinsic::riscv_sf_vsettk) && + "Unexpected XSfmm vset intrinsic"); + + unsigned SEW = RISCVVType::decodeVSEW(Node->getConstantOperandVal(2)); + unsigned Widen = RISCVVType::decodeTWiden(Node->getConstantOperandVal(3)); + unsigned PseudoOpCode = + IntNo == Intrinsic::riscv_sf_vsettnt ? RISCV::PseudoSF_VSETTNT + : IntNo == Intrinsic::riscv_sf_vsettm ? RISCV::PseudoSF_VSETTM + : RISCV::PseudoSF_VSETTK; + + if (IntNo == Intrinsic::riscv_sf_vsettnt) { + unsigned VTypeI = RISCVVType::encodeXSfmmVType(SEW, Widen, 0); + SDValue VTypeIOp = CurDAG->getTargetConstant(VTypeI, DL, XLenVT); + + ReplaceNode(Node, CurDAG->getMachineNode(PseudoOpCode, DL, XLenVT, + Node->getOperand(1), VTypeIOp)); + } else { + SDValue Log2SEW = CurDAG->getTargetConstant(Log2_32(SEW), DL, XLenVT); + SDValue TWiden = CurDAG->getTargetConstant(Widen, DL, XLenVT); + ReplaceNode(Node, + CurDAG->getMachineNode(PseudoOpCode, DL, XLenVT, + Node->getOperand(1), Log2SEW, TWiden)); + } +} + bool RISCVDAGToDAGISel::tryShrinkShlLogicImm(SDNode *Node) { MVT VT = Node->getSimpleValueType(0); unsigned Opcode = Node->getOpcode(); @@ -847,6 +885,11 @@ bool RISCVDAGToDAGISel::tryIndexedLoad(SDNode *Node) { return true; } +static Register getTileReg(uint64_t TileNum) { + assert(TileNum <= 15 && "Invalid tile number"); + return RISCV::T0 + TileNum; +} + void RISCVDAGToDAGISel::selectSF_VC_X_SE(SDNode *Node) { if (!Subtarget->hasVInstructions()) return; @@ -2035,6 +2078,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { case Intrinsic::riscv_vsetvli: case Intrinsic::riscv_vsetvlimax: return selectVSETVLI(Node); + case Intrinsic::riscv_sf_vsettnt: + case Intrinsic::riscv_sf_vsettm: + case Intrinsic::riscv_sf_vsettk: + return selectXSfmmVSET(Node); } break; } @@ -2458,6 +2505,142 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { case Intrinsic::riscv_sf_vc_i_se: selectSF_VC_X_SE(Node); return; + case Intrinsic::riscv_sf_vlte8: + case Intrinsic::riscv_sf_vlte16: + case Intrinsic::riscv_sf_vlte32: + case Intrinsic::riscv_sf_vlte64: { + unsigned Log2SEW; + unsigned PseudoInst; + switch (IntNo) { + case Intrinsic::riscv_sf_vlte8: + PseudoInst = RISCV::PseudoSF_VLTE8; + 
Log2SEW = 3; + break; + case Intrinsic::riscv_sf_vlte16: + PseudoInst = RISCV::PseudoSF_VLTE16; + Log2SEW = 4; + break; + case Intrinsic::riscv_sf_vlte32: + PseudoInst = RISCV::PseudoSF_VLTE32; + Log2SEW = 5; + break; + case Intrinsic::riscv_sf_vlte64: + PseudoInst = RISCV::PseudoSF_VLTE64; + Log2SEW = 6; + break; + } + + SDValue SEWOp = CurDAG->getTargetConstant(Log2SEW, DL, XLenVT); + SDValue TWidenOp = CurDAG->getTargetConstant(1, DL, XLenVT); + SDValue Operands[] = {Node->getOperand(2), + Node->getOperand(3), + Node->getOperand(4), + SEWOp, + TWidenOp, + Node->getOperand(0)}; + + MachineSDNode *TileLoad = + CurDAG->getMachineNode(PseudoInst, DL, Node->getVTList(), Operands); + if (auto *MemOp = dyn_cast<MemSDNode>(Node)) + CurDAG->setNodeMemRefs(TileLoad, {MemOp->getMemOperand()}); + + ReplaceNode(Node, TileLoad); + return; + } + case Intrinsic::riscv_sf_mm_s_s: + case Intrinsic::riscv_sf_mm_s_u: + case Intrinsic::riscv_sf_mm_u_s: + case Intrinsic::riscv_sf_mm_u_u: + case Intrinsic::riscv_sf_mm_e5m2_e5m2: + case Intrinsic::riscv_sf_mm_e5m2_e4m3: + case Intrinsic::riscv_sf_mm_e4m3_e5m2: + case Intrinsic::riscv_sf_mm_e4m3_e4m3: + case Intrinsic::riscv_sf_mm_f_f: { + bool HasFRM = false; + unsigned PseudoInst; + switch (IntNo) { + case Intrinsic::riscv_sf_mm_s_s: + PseudoInst = RISCV::PseudoSF_MM_S_S; + break; + case Intrinsic::riscv_sf_mm_s_u: + PseudoInst = RISCV::PseudoSF_MM_S_U; + break; + case Intrinsic::riscv_sf_mm_u_s: + PseudoInst = RISCV::PseudoSF_MM_U_S; + break; + case Intrinsic::riscv_sf_mm_u_u: + PseudoInst = RISCV::PseudoSF_MM_U_U; + break; + case Intrinsic::riscv_sf_mm_e5m2_e5m2: + PseudoInst = RISCV::PseudoSF_MM_E5M2_E5M2; + HasFRM = true; + break; + case Intrinsic::riscv_sf_mm_e5m2_e4m3: + PseudoInst = RISCV::PseudoSF_MM_E5M2_E4M3; + HasFRM = true; + break; + case Intrinsic::riscv_sf_mm_e4m3_e5m2: + PseudoInst = RISCV::PseudoSF_MM_E4M3_E5M2; + HasFRM = true; + break; + case Intrinsic::riscv_sf_mm_e4m3_e4m3: + PseudoInst = RISCV::PseudoSF_MM_E4M3_E4M3; + HasFRM = true; + break; + case Intrinsic::riscv_sf_mm_f_f: + if (Node->getOperand(3).getValueType().getScalarType() == MVT::bf16) + PseudoInst = RISCV::PseudoSF_MM_F_F_ALT; + else + PseudoInst = RISCV::PseudoSF_MM_F_F; + HasFRM = true; + break; + } + uint64_t TileNum = Node->getConstantOperandVal(2); + SDValue Op1 = Node->getOperand(3); + SDValue Op2 = Node->getOperand(4); + MVT VT = Op1->getSimpleValueType(0); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + SDValue TmOp = Node->getOperand(5); + SDValue TnOp = Node->getOperand(6); + SDValue TkOp = Node->getOperand(7); + SDValue TWidenOp = Node->getOperand(8); + SDValue Chain = Node->getOperand(0); + + // sf.mm.f.f with sew=32, twiden=2 is invalid + if (IntNo == Intrinsic::riscv_sf_mm_f_f && Log2SEW == 5 && + TWidenOp->getAsZExtVal() == 2) + reportFatalUsageError("sf.mm.f.f doesn't support (sew=32, twiden=2)"); + + SmallVector<SDValue, 10> Operands( + {CurDAG->getRegister(getTileReg(TileNum), XLenVT), Op1, Op2}); + if (HasFRM) + Operands.push_back( + CurDAG->getTargetConstant(RISCVFPRndMode::DYN, DL, XLenVT)); + Operands.append({TmOp, TnOp, TkOp, + CurDAG->getTargetConstant(Log2SEW, DL, XLenVT), TWidenOp, + Chain}); + + auto *NewNode = + CurDAG->getMachineNode(PseudoInst, DL, Node->getVTList(), Operands); + + ReplaceNode(Node, NewNode); + return; + } + case Intrinsic::riscv_sf_vtzero_t: { + uint64_t TileNum = Node->getConstantOperandVal(2); + SDValue Tm = Node->getOperand(3); + SDValue Tn = Node->getOperand(4); + SDValue Log2SEW = Node->getOperand(5); + SDValue 
TWiden = Node->getOperand(6); + SDValue Chain = Node->getOperand(0); + auto *NewNode = CurDAG->getMachineNode( + RISCV::PseudoSF_VTZERO_T, DL, Node->getVTList(), + {CurDAG->getRegister(getTileReg(TileNum), XLenVT), Tm, Tn, Log2SEW, + TWiden, Chain}); + + ReplaceNode(Node, NewNode); + return; + } } break; } @@ -3353,14 +3536,20 @@ bool RISCVDAGToDAGISel::selectSETCC(SDValue N, ISD::CondCode ExpectedCCVal, 0); return true; } - // If the RHS is [-2047,2048], we can use addi with -RHS to produce 0 if the - // LHS is equal to the RHS and non-zero otherwise. + // If the RHS is [-2047,2048], we can use addi/addiw with -RHS to produce 0 + // if the LHS is equal to the RHS and non-zero otherwise. if (isInt<12>(CVal) || CVal == 2048) { - Val = SDValue( - CurDAG->getMachineNode( - RISCV::ADDI, DL, N->getValueType(0), LHS, - CurDAG->getSignedTargetConstant(-CVal, DL, N->getValueType(0))), - 0); + unsigned Opc = RISCV::ADDI; + if (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG && + cast<VTSDNode>(LHS.getOperand(1))->getVT() == MVT::i32) { + Opc = RISCV::ADDIW; + LHS = LHS.getOperand(0); + } + + Val = SDValue(CurDAG->getMachineNode(Opc, DL, N->getValueType(0), LHS, + CurDAG->getSignedTargetConstant( + -CVal, DL, N->getValueType(0))), + 0); return true; } if (isPowerOf2_64(CVal) && Subtarget->hasStdExtZbs()) { diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index f03b44c..19ee103 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -165,6 +165,7 @@ public: void selectVSXSEG(SDNode *Node, unsigned NF, bool IsMasked, bool IsOrdered); void selectVSETVLI(SDNode *Node); + void selectXSfmmVSET(SDNode *Node); void selectSF_VC_X_SE(SDNode *Node); diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index cf8d120..1b7cb9b 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -168,10 +168,13 @@ struct DemandedFields { // If this is true, we demand that VTYPE is set to some legal state, i.e. that // vill is unset. 
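// (The two demand bits added below extend this scheme to the XSfmm vtype
// fields: TWiden demands the twiden encoding and AltFmt the altfmt bit,
// mirroring the new compatibility checks added to areCompatibleVTYPEs below.)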
bool VILL = false; + bool TWiden = false; + bool AltFmt = false; // Return true if any part of VTYPE was used bool usedVTYPE() const { - return SEW || LMUL || SEWLMULRatio || TailPolicy || MaskPolicy || VILL; + return SEW || LMUL || SEWLMULRatio || TailPolicy || MaskPolicy || VILL || + TWiden || AltFmt; } // Return true if any property of VL was used @@ -187,6 +190,8 @@ struct DemandedFields { TailPolicy = true; MaskPolicy = true; VILL = true; + TWiden = true; + AltFmt = true; } // Mark all VL properties as demanded @@ -212,6 +217,8 @@ struct DemandedFields { TailPolicy |= B.TailPolicy; MaskPolicy |= B.MaskPolicy; VILL |= B.VILL; + AltFmt |= B.AltFmt; + TWiden |= B.TWiden; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -258,7 +265,9 @@ struct DemandedFields { OS << "SEWLMULRatio=" << SEWLMULRatio << ", "; OS << "TailPolicy=" << TailPolicy << ", "; OS << "MaskPolicy=" << MaskPolicy << ", "; - OS << "VILL=" << VILL; + OS << "VILL=" << VILL << ", "; + OS << "AltFmt=" << AltFmt << ", "; + OS << "TWiden=" << TWiden; OS << "}"; } #endif @@ -328,6 +337,15 @@ static bool areCompatibleVTYPEs(uint64_t CurVType, uint64_t NewVType, if (Used.MaskPolicy && RISCVVType::isMaskAgnostic(CurVType) != RISCVVType::isMaskAgnostic(NewVType)) return false; + if (Used.TWiden && (RISCVVType::hasXSfmmWiden(CurVType) != + RISCVVType::hasXSfmmWiden(NewVType) || + (RISCVVType::hasXSfmmWiden(CurVType) && + RISCVVType::getXSfmmWiden(CurVType) != + RISCVVType::getXSfmmWiden(NewVType)))) + return false; + if (Used.AltFmt && + RISCVVType::isAltFmt(CurVType) != RISCVVType::isAltFmt(NewVType)) + return false; return true; } @@ -479,6 +497,11 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) { Res.TailPolicy = false; } + Res.AltFmt = RISCVII::getAltFmtType(MI.getDesc().TSFlags) != + RISCVII::AltFmtType::DontCare; + Res.TWiden = RISCVII::hasTWidenOp(MI.getDesc().TSFlags) || + RISCVInstrInfo::isXSfmmVectorConfigInstr(MI); + return Res; } @@ -510,6 +533,8 @@ class VSETVLIInfo { uint8_t TailAgnostic : 1; uint8_t MaskAgnostic : 1; uint8_t SEWLMULRatioOnly : 1; + uint8_t AltFmt : 1; + uint8_t TWiden : 3; public: VSETVLIInfo() @@ -586,6 +611,8 @@ public: RISCVVType::VLMUL getVLMUL() const { return VLMul; } bool getTailAgnostic() const { return TailAgnostic; } bool getMaskAgnostic() const { return MaskAgnostic; } + bool getAltFmt() const { return AltFmt; } + unsigned getTWiden() const { return TWiden; } bool hasNonZeroAVL(const LiveIntervals *LIS) const { if (hasAVLImm()) @@ -647,21 +674,31 @@ public: SEW = RISCVVType::getSEW(VType); TailAgnostic = RISCVVType::isTailAgnostic(VType); MaskAgnostic = RISCVVType::isMaskAgnostic(VType); + AltFmt = RISCVVType::isAltFmt(VType); + TWiden = + RISCVVType::hasXSfmmWiden(VType) ? 
RISCVVType::getXSfmmWiden(VType) : 0; } - void setVTYPE(RISCVVType::VLMUL L, unsigned S, bool TA, bool MA) { + void setVTYPE(RISCVVType::VLMUL L, unsigned S, bool TA, bool MA, bool Altfmt, + unsigned W) { assert(isValid() && !isUnknown() && "Can't set VTYPE for uninitialized or unknown"); VLMul = L; SEW = S; TailAgnostic = TA; MaskAgnostic = MA; + AltFmt = Altfmt; + TWiden = W; } + void setAltFmt(bool AF) { AltFmt = AF; } + void setVLMul(RISCVVType::VLMUL VLMul) { this->VLMul = VLMul; } unsigned encodeVTYPE() const { assert(isValid() && !isUnknown() && !SEWLMULRatioOnly && "Can't encode VTYPE for uninitialized or unknown"); + if (TWiden != 0) + return RISCVVType::encodeXSfmmVType(SEW, TWiden, AltFmt); return RISCVVType::encodeVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic); } @@ -674,9 +711,9 @@ public: "Can't compare VTYPE in unknown state"); assert(!SEWLMULRatioOnly && !Other.SEWLMULRatioOnly && "Can't compare when only LMUL/SEW ratio is valid."); - return std::tie(VLMul, SEW, TailAgnostic, MaskAgnostic) == + return std::tie(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt, TWiden) == std::tie(Other.VLMul, Other.SEW, Other.TailAgnostic, - Other.MaskAgnostic); + Other.MaskAgnostic, Other.AltFmt, Other.TWiden); } unsigned getSEWLMULRatio() const { @@ -825,7 +862,9 @@ public: << "SEW=e" << (unsigned)SEW << ", " << "TailAgnostic=" << (bool)TailAgnostic << ", " << "MaskAgnostic=" << (bool)MaskAgnostic << ", " - << "SEWLMULRatioOnly=" << (bool)SEWLMULRatioOnly << "}"; + << "SEWLMULRatioOnly=" << (bool)SEWLMULRatioOnly << ", " + << "TWiden=" << (unsigned)TWiden << ", " + << "AltFmt=" << (bool)AltFmt << "}"; } #endif }; @@ -853,6 +892,11 @@ struct BlockData { BlockData() = default; }; +enum TKTMMode { + VSETTK = 0, + VSETTM = 1, +}; + class RISCVInsertVSETVLI : public MachineFunctionPass { const RISCVSubtarget *ST; const TargetInstrInfo *TII; @@ -908,6 +952,7 @@ private: VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) const; VSETVLIInfo computeInfoForInstr(const MachineInstr &MI) const; void forwardVSETVLIAVL(VSETVLIInfo &Info) const; + bool insertVSETMTK(MachineBasicBlock &MBB, TKTMMode Mode) const; }; } // end anonymous namespace @@ -945,6 +990,18 @@ RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const { VSETVLIInfo NewInfo; if (MI.getOpcode() == RISCV::PseudoVSETIVLI) { NewInfo.setAVLImm(MI.getOperand(1).getImm()); + } else if (RISCVInstrInfo::isXSfmmVectorConfigTNInstr(MI)) { + assert(MI.getOpcode() == RISCV::PseudoSF_VSETTNT || + MI.getOpcode() == RISCV::PseudoSF_VSETTNTX0); + switch (MI.getOpcode()) { + case RISCV::PseudoSF_VSETTNTX0: + NewInfo.setAVLVLMAX(); + break; + case RISCV::PseudoSF_VSETTNT: + Register ATNReg = MI.getOperand(1).getReg(); + NewInfo.setAVLRegDef(getVNInfoFromReg(ATNReg, MI, LIS), ATNReg); + break; + } } else { assert(MI.getOpcode() == RISCV::PseudoVSETVLI || MI.getOpcode() == RISCV::PseudoVSETVLIX0); @@ -1005,11 +1062,34 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const { RISCVVType::VLMUL VLMul = RISCVII::getLMul(TSFlags); + bool AltFmt = RISCVII::getAltFmtType(TSFlags) == RISCVII::AltFmtType::AltFmt; + InstrInfo.setAltFmt(AltFmt); + unsigned Log2SEW = MI.getOperand(getSEWOpNum(MI)).getImm(); // A Log2SEW of 0 is an operation on mask registers only. unsigned SEW = Log2SEW ? 
1 << Log2SEW : 8; assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW"); + if (RISCVII::hasTWidenOp(TSFlags)) { + const MachineOperand &TWidenOp = + MI.getOperand(MI.getNumExplicitOperands() - 1); + unsigned TWiden = TWidenOp.getImm(); + + InstrInfo.setAVLVLMAX(); + if (RISCVII::hasVLOp(TSFlags)) { + const MachineOperand &TNOp = + MI.getOperand(RISCVII::getTNOpNum(MI.getDesc())); + + if (TNOp.getReg().isVirtual()) + InstrInfo.setAVLRegDef(getVNInfoFromReg(TNOp.getReg(), MI, LIS), + TNOp.getReg()); + } + + InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt, TWiden); + + return InstrInfo; + } + if (RISCVII::hasVLOp(TSFlags)) { const MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI)); if (VLOp.isImm()) { @@ -1045,7 +1125,9 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const { assert(SEW == EEW && "Initial SEW doesn't match expected EEW"); } #endif - InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic); + // TODO: Propagate the twiden from previous vtype for potential reuse. + InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt, + /*TWiden*/ 0); forwardVSETVLIAVL(InstrInfo); @@ -1053,10 +1135,33 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const { } void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertPt, DebugLoc DL, - const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo) { - + MachineBasicBlock::iterator InsertPt, + DebugLoc DL, const VSETVLIInfo &Info, + const VSETVLIInfo &PrevInfo) { ++NumInsertedVSETVL; + + if (Info.getTWiden()) { + if (Info.hasAVLVLMAX()) { + Register DestReg = MRI->createVirtualRegister(&RISCV::GPRNoX0RegClass); + auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoSF_VSETTNTX0)) + .addReg(DestReg, RegState::Define | RegState::Dead) + .addReg(RISCV::X0, RegState::Kill) + .addImm(Info.encodeVTYPE()); + if (LIS) { + LIS->InsertMachineInstrInMaps(*MI); + LIS->createAndComputeVirtRegInterval(DestReg); + } + } else { + auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoSF_VSETTNT)) + .addReg(RISCV::X0, RegState::Define | RegState::Dead) + .addReg(Info.getAVLReg()) + .addImm(Info.encodeVTYPE()); + if (LIS) + LIS->InsertMachineInstrInMaps(*MI); + } + return; + } + if (PrevInfo.isValid() && !PrevInfo.isUnknown()) { // Use X0, X0 form if the AVL is the same and the SEW+LMUL gives the same // VLMAX. @@ -1198,7 +1303,8 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info, // be coalesced into another vsetvli since we won't demand any fields. VSETVLIInfo NewInfo; // Need a new VSETVLIInfo to clear SEWLMULRatioOnly NewInfo.setAVLImm(1); - NewInfo.setVTYPE(RISCVVType::LMUL_1, /*sew*/ 8, /*ta*/ true, /*ma*/ true); + NewInfo.setVTYPE(RISCVVType::LMUL_1, /*sew*/ 8, /*ta*/ true, /*ma*/ true, + /*AltFmt*/ false, /*W*/ 0); Info = NewInfo; return; } @@ -1240,7 +1346,9 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info, (Demanded.TailPolicy ? IncomingInfo : Info).getTailAgnostic() || IncomingInfo.getTailAgnostic(), (Demanded.MaskPolicy ? IncomingInfo : Info).getMaskAgnostic() || - IncomingInfo.getMaskAgnostic()); + IncomingInfo.getMaskAgnostic(), + (Demanded.AltFmt ? IncomingInfo : Info).getAltFmt(), + Demanded.TWiden ? IncomingInfo.getTWiden() : 0); // If we only knew the sew/lmul ratio previously, replace the VTYPE but keep // the AVL. 
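For readers tracking the new vtype state: a non-zero TWiden is what flips VSETVLIInfo::encodeVTYPE() over to the XSfmm encoding, which is also why areCompatibleVTYPEs() above must compare TWiden and AltFmt explicitly. A minimal sketch of that dispatch, using only accessors visible in this patch (the free function encodeEitherVType is illustrative, not part of the change; the actual bit layout stays inside the RISCVVType helpers):

// Sketch: choose the VTYPE encoding the way VSETVLIInfo::encodeVTYPE does.
// TWiden == 0 means "not an XSfmm operation".
unsigned encodeEitherVType(const VSETVLIInfo &Info) {
  if (Info.getTWiden() != 0) // XSfmm state: SEW, widen factor, altfmt.
    return RISCVVType::encodeXSfmmVType(Info.getSEW(), Info.getTWiden(),
                                        Info.getAltFmt());
  // Classic vector config: LMUL, SEW and the tail/mask policies.
  return RISCVVType::encodeVTYPE(Info.getVLMUL(), Info.getSEW(),
                                 Info.getTailAgnostic(),
                                 Info.getMaskAgnostic());
}

Two states that agree on every classic field can still differ once the XSfmm bits are demanded, hence the extra fields in the equality and compatibility checks above.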
@@ -1293,7 +1401,8 @@ bool RISCVInsertVSETVLI::computeVLVTYPEChanges(const MachineBasicBlock &MBB, if (RISCVInstrInfo::isVectorConfigInstr(MI) || RISCVII::hasSEWOp(MI.getDesc().TSFlags) || - isVectorCopy(ST->getRegisterInfo(), MI)) + isVectorCopy(ST->getRegisterInfo(), MI) || + RISCVInstrInfo::isXSfmmVectorConfigInstr(MI)) HadVectorOp = true; transferAfter(Info, MI); @@ -1675,6 +1784,12 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const { }; for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) { + // TODO: Support XSfmm. + if (RISCVII::hasTWidenOp(MI.getDesc().TSFlags) || + RISCVInstrInfo::isXSfmmVectorConfigInstr(MI)) { + NextMI = nullptr; + continue; + } if (!RISCVInstrInfo::isVectorConfigInstr(MI)) { Used.doUnion(getDemanded(MI, ST)); @@ -1788,6 +1903,65 @@ void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) { } } +bool RISCVInsertVSETVLI::insertVSETMTK(MachineBasicBlock &MBB, + TKTMMode Mode) const { + + bool Changed = false; + for (auto &MI : MBB) { + uint64_t TSFlags = MI.getDesc().TSFlags; + if (RISCVInstrInfo::isXSfmmVectorConfigTMTKInstr(MI) || + !RISCVII::hasSEWOp(TSFlags) || !RISCVII::hasTWidenOp(TSFlags)) + continue; + + VSETVLIInfo CurrInfo = computeInfoForInstr(MI); + + if (Mode == VSETTK && !RISCVII::hasTKOp(TSFlags)) + continue; + + if (Mode == VSETTM && !RISCVII::hasTMOp(TSFlags)) + continue; + + unsigned OpNum = 0; + unsigned Opcode = 0; + switch (Mode) { + case VSETTK: + OpNum = RISCVII::getTKOpNum(MI.getDesc()); + Opcode = RISCV::PseudoSF_VSETTK; + break; + case VSETTM: + OpNum = RISCVII::getTMOpNum(MI.getDesc()); + Opcode = RISCV::PseudoSF_VSETTM; + break; + } + + assert(OpNum && Opcode && "Invalid OpNum or Opcode"); + + MachineOperand &Op = MI.getOperand(OpNum); + + auto TmpMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opcode)) + .addReg(RISCV::X0, RegState::Define | RegState::Dead) + .addReg(Op.getReg()) + .addImm(Log2_32(CurrInfo.getSEW())) + .addImm(Log2_32(CurrInfo.getTWiden()) + 1); + + Changed = true; + Register Reg = Op.getReg(); + Op.setReg(Register()); + Op.setIsKill(false); + if (LIS) { + LIS->InsertMachineInstrInMaps(*TmpMI); + LiveInterval &LI = LIS->getInterval(Reg); + + // Erase the AVL operand from the instruction. + LIS->shrinkToUses(&LI); + // TODO: Enable this once needVSETVLIPHI is supported. + // SmallVector<LiveInterval *> SplitLIs; + // LIS->splitSeparateComponents(LI, SplitLIs); + } + } + return Changed; +} + bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) { // Skip if the vector extension is not enabled. 
ST = &MF.getSubtarget<RISCVSubtarget>(); @@ -1865,6 +2039,11 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) insertReadVL(MBB); + for (MachineBasicBlock &MBB : MF) { + insertVSETMTK(MBB, VSETTM); + insertVSETMTK(MBB, VSETTK); + } + BlockInfo.clear(); return HaveVectorOp; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index 2afd77a..fee1d15 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -177,7 +177,7 @@ def EltDepsVL : EltDeps<vl=1, mask=0>; def EltDepsMask : EltDeps<vl=0, mask=1>; def EltDepsVLMask : EltDeps<vl=1, mask=1>; -class EEW <bits<2> val> { +class EEW<bits<2> val> { bits<2> Value = val; } def EEW1 : EEW<0>; @@ -185,6 +185,13 @@ def EEWSEWx1 : EEW<1>; def EEWSEWx2 : EEW<2>; def EEWSEWx4 : EEW<3>; +class AltFmtType<bits<2> val> { + bits<2> Value = val; +} +def DONT_CARE_ALTFMT : AltFmtType<0>; +def IS_NOT_ALTFMT : AltFmtType<1>; +def IS_ALTFMT : AltFmtType<2>; + class RVInstCommon<dag outs, dag ins, string opcodestr, string argstr, list<dag> pattern, InstFormat format> : Instruction { let Namespace = "RISCV"; @@ -267,6 +274,22 @@ class RVInstCommon<dag outs, dag ins, string opcodestr, string argstr, // operands' VLs. bit ReadsPastVL = 0; let TSFlags{26} = ReadsPastVL; + + // 0 -> Don't care about altfmt bit in VTYPE. + // 1 -> Is not altfmt. + // 2 -> Is altfmt(BF16). + AltFmtType AltFmtType = DONT_CARE_ALTFMT; + let TSFlags{28-27} = AltFmtType.Value; + + // XSfmmbase + bit HasTWidenOp = 0; + let TSFlags{29} = HasTWidenOp; + + bit HasTmOp = 0; + let TSFlags{30} = HasTmOp; + + bit HasTkOp = 0; + let TSFlags{31} = HasTkOp; } class RVInst<dag outs, dag ins, string opcodestr, string argstr, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 96e1078..ddb53a2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -3005,6 +3005,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, else Ok = RISCVFPRndMode::isValidRoundingMode(Imm); break; + case RISCVOp::OPERAND_XSFMM_VTYPE: + Ok = RISCVVType::isValidXSfmmVType(Imm); + break; } if (!Ok) { ErrInfo = "Invalid immediate"; @@ -3670,6 +3673,11 @@ std::string RISCVInstrInfo::createMIROperandComment( RISCVVType::printVType(Imm, OS); break; } + case RISCVOp::OPERAND_XSFMM_VTYPE: { + unsigned Imm = Op.getImm(); + RISCVVType::printXSfmmVType(Imm, OS); + break; + } case RISCVOp::OPERAND_SEW: case RISCVOp::OPERAND_SEW_MASK: { unsigned Log2SEW = Op.getImm(); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 298d35a..65865ce 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -159,7 +159,8 @@ class PseudoToVInst<string PseudoInst> { ["_M4", ""], ["_M8", ""], ["_SE", ""], - ["_RM", ""] + ["_RM", ""], + ["_ALT", ""] ]; string VInst = !foldl(PseudoInst, AffixSubsts, Acc, AffixSubst, !subst(AffixSubst[0], AffixSubst[1], Acc)); @@ -6396,7 +6397,7 @@ let Defs = [VXSAT] in { // 13. Vector Floating-Point Instructions //===----------------------------------------------------------------------===// -let Predicates = [HasVInstructionsAnyF] in { +let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in { //===----------------------------------------------------------------------===// // 13.2. 
Vector Single-Width Floating-Point Add/Subtract Instructions //===----------------------------------------------------------------------===// @@ -6565,7 +6566,7 @@ defm PseudoVFNCVT_F_F : VPseudoVNCVTD_W_RM; defm PseudoVFNCVT_ROD_F_F : VPseudoVNCVTD_W; } // mayRaiseFPException = true -} // Predicates = [HasVInstructionsAnyF] +} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT //===----------------------------------------------------------------------===// // 14. Vector Reduction Operations @@ -6593,7 +6594,7 @@ defm PseudoVWREDSUM : VPseudoVWRED_VS; } } // Predicates = [HasVInstructions] -let Predicates = [HasVInstructionsAnyF] in { +let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in { //===----------------------------------------------------------------------===// // 14.3. Vector Single-Width Floating-Point Reduction Instructions //===----------------------------------------------------------------------===// @@ -6612,7 +6613,7 @@ defm PseudoVFWREDUSUM : VPseudoVFWRED_VS_RM; defm PseudoVFWREDOSUM : VPseudoVFWREDO_VS_RM; } -} // Predicates = [HasVInstructionsAnyF] +} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT //===----------------------------------------------------------------------===// // 15. Vector Mask Instructions @@ -6703,7 +6704,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { // 16.2. Floating-Point Scalar Move Instructions //===----------------------------------------------------------------------===// -let Predicates = [HasVInstructionsAnyF] in { +let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { foreach f = FPList in { let HasSEWOp = 1, BaseInstr = VFMV_F_S in @@ -6718,7 +6719,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>; } } -} // Predicates = [HasVInstructionsAnyF] +} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT //===----------------------------------------------------------------------===// // 16.3. Vector Slide Instructions @@ -6730,10 +6731,10 @@ let Predicates = [HasVInstructions] in { defm PseudoVSLIDE1DOWN : VPseudoVSLD1_VX; } // Predicates = [HasVInstructions] -let Predicates = [HasVInstructionsAnyF] in { +let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in { defm PseudoVFSLIDE1UP : VPseudoVSLD1_VF<"@earlyclobber $rd">; defm PseudoVFSLIDE1DOWN : VPseudoVSLD1_VF; -} // Predicates = [HasVInstructionsAnyF] +} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT //===----------------------------------------------------------------------===// // 16.4. 
Vector Register Gather Instructions diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 557d873..6a4119a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -438,8 +438,10 @@ let Predicates = [HasVendorXSfvcp] in { } foreach f = FPList in { foreach m = f.MxList in { - defm f.FX # "V" : VPseudoVC_XV<m, f.fprclass, payload1>; - defm f.FX # "VV" : VPseudoVC_XVV<m, f.fprclass, payload1>; + let AltFmtType = IS_NOT_ALTFMT in { + defm f.FX # "V" : VPseudoVC_XV<m, f.fprclass, payload1>; + defm f.FX # "VV" : VPseudoVC_XVV<m, f.fprclass, payload1>; + } } } foreach m = MxListW in { @@ -449,7 +451,8 @@ let Predicates = [HasVendorXSfvcp] in { } foreach f = FPListW in { foreach m = f.MxList in - defm f.FX # "VW" : VPseudoVC_XVW<m, f.fprclass, payload1>; + let AltFmtType = IS_NOT_ALTFMT in + defm f.FX # "VW" : VPseudoVC_XVW<m, f.fprclass, payload1>; } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td index a5ee701..d77a44a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td @@ -225,7 +225,7 @@ let Predicates = [HasVendorXSfmmbase] in { def SF_VSETTM : SFInstSetSingle<(outs GPR:$rd), (ins GPR:$rs1), 0b00001, "sf.vsettm", "$rd, $rs1">; def SF_VSETTK : SFInstSetSingle<(outs GPR:$rd), (ins GPR:$rs1), 0b00010, - "sf.vsettk", "$rd, $rs1">; + "sf.vsettk", "$rd, $rs1">; def SF_VTDISCARD : SFInstVtDiscard<"sf.vtdiscard">; def SF_VTMV_V_T : SFInstTileMoveOp<0b010000, (outs VR:$vd), (ins GPR:$rs1), @@ -277,3 +277,195 @@ let Uses = [FRM], mayRaiseFPException = true in { } // Predicates = [HasVendorXSfmm32a8f] } // DecoderNamespace = "XSfvector" + +class VPseudoSF_VTileLoad + : RISCVVPseudo<(outs), (ins GPR:$rs2, GPR:$rs1, AVL:$atn, ixlenimm:$sew, + ixlenimm:$twiden)> { + let mayLoad = 1; + let mayStore = 0; + let HasVLOp = 1; // Tn + let HasSEWOp = 1; + let HasTWidenOp = 1; + let hasSideEffects = 1; +} + +class VPseudoSF_VTileStore + : RISCVVPseudo<(outs), (ins GPR:$rs2, GPR:$rs1, AVL:$atn, ixlenimm:$sew, + ixlenimm:$twiden)> { + let mayLoad = 0; + let mayStore = 1; + let HasVLOp = 1; // Tn + let HasSEWOp = 1; + let HasTWidenOp = 1; + let hasSideEffects = 1; +} + +class VPseudoSF_VTileMove_V_T + : RISCVVPseudo<(outs VRM8:$vd), (ins GPR:$rs1, AVL:$atn, ixlenimm:$sew, + ixlenimm:$twiden)> { + let mayLoad = 0; + let mayStore = 0; + let HasVLOp = 1; // Tn + let HasSEWOp = 1; + let HasTWidenOp = 1; + let hasSideEffects = 1; +} + +class VPseudoSF_VTileMove_T_V + : RISCVVPseudo<(outs), (ins GPR:$rs1, VRM8:$vs2, AVL:$atn, ixlenimm:$sew, + ixlenimm:$twiden)> { + let mayLoad = 0; + let mayStore = 0; + let HasVLOp = 1; // Tn + let HasSEWOp = 1; + let HasTWidenOp = 1; + let hasSideEffects = 1; +} + +class VPseudoSF_MatMul<RegisterClass mtd_class> + : RISCVVPseudo<(outs), + (ins mtd_class:$rd, VRM8:$vs2, VRM8:$vs1, AVL:$atm, AVL:$atn, + AVL:$atk, ixlenimm:$sew, ixlenimm:$twiden)> { + let mayLoad = 0; + let mayStore = 0; + let HasTmOp = 1; + let HasVLOp = 1; // Tn + let HasTkOp = 1; + let HasSEWOp = 1; + let HasTWidenOp = 1; + let hasSideEffects = 1; +} + +class VPseudoSF_MatMul_FRM<RegisterClass mtd_class> + : RISCVVPseudo<(outs), + (ins mtd_class:$rd, VRM8:$vs2, VRM8:$vs1, ixlenimm:$frm, + AVL:$atm, AVL:$atn, AVL:$atk, ixlenimm:$sew, + ixlenimm:$twiden), []> { + let mayLoad = 0; + let mayStore = 0; + let HasTmOp = 1; + let HasVLOp = 1; // Tn + let HasTkOp = 1; + let HasSEWOp = 1; + let HasRoundModeOp = 1; + 
let hasPostISelHook = 1; + let HasTWidenOp = 1; + let hasSideEffects = 1; + let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +let Defs = [VL, VTYPE] in { + def PseudoSF_VSETTNT + : Pseudo<(outs GPR:$rd), + (ins GPRNoX0:$rs1, XSfmmVTypeOp:$vtypei), []>, + PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>, + Sched<[WriteVSETVLI, ReadVSETVLI]>; + def PseudoSF_VSETTNTX0 + : Pseudo<(outs GPRNoX0:$rd), + (ins GPRX0:$rs1, XSfmmVTypeOp:$vtypei), []>, + PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>, + Sched<[WriteVSETVLI, ReadVSETVLI]>; + def PseudoSF_VSETTNTX0X0 + : Pseudo<(outs GPRX0:$rd), + (ins GPRX0:$rs1, XSfmmVTypeOp:$vtypei), []>, + PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>, + Sched<[WriteVSETVLI, ReadVSETVLI]>; +} + +let Defs = [VTYPE], Uses = [VTYPE], HasTWidenOp = 1, HasSEWOp = 1 in { + def PseudoSF_VSETTM + : Pseudo<(outs GPR:$rd), + (ins GPR:$rs1, ixlenimm:$log2sew, ixlenimm:$twiden), []>, + PseudoInstExpansion<(SF_VSETTM GPR:$rd, GPR:$rs1)>, + Sched<[WriteVSETVLI, ReadVSETVLI]>; + def PseudoSF_VSETTK + : Pseudo<(outs GPR:$rd), + (ins GPR:$rs1, ixlenimm:$log2sew, ixlenimm:$twiden), []>, + PseudoInstExpansion<(SF_VSETTK GPR:$rd, GPR:$rs1)>, + Sched<[WriteVSETVLI, ReadVSETVLI]>; +} +} + +foreach eew = [8, 16, 32, 64] in { + def PseudoSF_VLTE # eew : VPseudoSF_VTileLoad; + def PseudoSF_VSTE # eew : VPseudoSF_VTileStore; +} + +def PseudoSF_VTMV_T_V : VPseudoSF_VTileMove_T_V; +def PseudoSF_VTMV_V_T : VPseudoSF_VTileMove_V_T; + +foreach a = I8Encodes in + foreach b = I8Encodes in + def PseudoSF_MM_ # !toupper(a.Name) # _ # !toupper(b.Name) + : VPseudoSF_MatMul<TRM4>; + +let AltFmtType = IS_NOT_ALTFMT in + def PseudoSF_MM_F_F : VPseudoSF_MatMul_FRM<TRM2>; +let AltFmtType = IS_ALTFMT in + def PseudoSF_MM_F_F_ALT : VPseudoSF_MatMul_FRM<TRM2>; + +foreach e1 = [5, 4] in + foreach e2 = [5, 4] in + def PseudoSF_MM_E # e1 # M # !sub(7, e1) # _E # e2 # M # !sub(7, e2) + : VPseudoSF_MatMul_FRM<TRM4>; + +let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in { + let HasVLOp = 1, HasTmOp = 1, HasTWidenOp = 1, HasSEWOp = 1 in + def PseudoSF_VTZERO_T + : RISCVVPseudo<(outs), + (ins TR:$rd, AVL:$atm, AVL:$atn, ixlenimm:$sew, + ixlenimm:$twiden)>; + def PseudoSF_VTDISCARD : RISCVVPseudo<(outs), (ins), []>; +} + +class VPatXSfmmTileStore<string intrinsic_name, + string inst_name, + int log2sew> : + Pat<(!cast<Intrinsic>(intrinsic_name) + (XLenVT GPR:$rs2), + (XLenVT GPR:$rs1), + (XLenVT AVL:$tn)), + (!cast<Instruction>(inst_name) + (XLenVT GPR:$rs2), + (XLenVT GPR:$rs1), + GPR:$tn, log2sew, 1)>; + +class VPatXSfmmTileMove_T_V<string intrinsic_name, + string inst_name, + ValueType reg_type, + int log2sew> : + Pat<(!cast<Intrinsic>(intrinsic_name) + (XLenVT GPR:$rs1), + (reg_type VRM8:$vs2), + (XLenVT AVL:$atn)), + (!cast<Instruction>(inst_name) + (XLenVT GPR:$rs1), + (reg_type VRM8:$vs2), + GPR:$atn, log2sew, 1)>; + +class VPatXSfmmTileMove_V_T<string intrinsic_name, + string inst_name, + ValueType result_type, + int log2sew> : + Pat<(result_type (!cast<Intrinsic>(intrinsic_name) + (XLenVT GPR:$rs1), + (XLenVT AVL:$atn))), + (!cast<Instruction>(inst_name) + (XLenVT GPR:$rs1), + GPR:$atn, log2sew, 1)>; + +class VPatXSfmmVTDiscard<string intrinsic_name, + string inst_name> : + Pat<(!cast<Intrinsic>(intrinsic_name)), + (!cast<Instruction>(inst_name))>; + +foreach eew = [8, 16, 32, 64] in + def : VPatXSfmmTileStore<"int_riscv_sf_vste" # eew, "PseudoSF_VSTE" # eew,
!logtwo(eew)>; + +foreach vti = [VI8M8, VI16M8, VI32M8, VI64M8, VF16M8, VF32M8, VF64M8, VBF16M8] in { + def : VPatXSfmmTileMove_T_V<"int_riscv_sf_vtmv_t_v", "PseudoSF_VTMV_T_V", vti.Vector, vti.Log2SEW>; + def : VPatXSfmmTileMove_V_T<"int_riscv_sf_vtmv_v_t", "PseudoSF_VTMV_V_T", vti.Vector, vti.Log2SEW>; +} + +def : VPatXSfmmVTDiscard<"int_riscv_sf_vtdiscard", "PseudoSF_VTDISCARD">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td index 3658817..dcae977 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td +++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td @@ -78,7 +78,41 @@ def isVectorConfigInstr PseudoVSETVLI, PseudoVSETVLIX0, PseudoVSETVLIX0X0, - PseudoVSETIVLI + PseudoVSETIVLI, + PseudoSF_VSETTNT, + PseudoSF_VSETTNTX0, + PseudoSF_VSETTNTX0X0 + ]>>>; + +// Returns true if this is a PseudoSF_VSETTNT* instruction. +def isXSfmmVectorConfigTNInstr + : TIIPredicate<"isXSfmmVectorConfigTNInstr", + MCReturnStatement< + CheckOpcode<[ + PseudoSF_VSETTNT, + PseudoSF_VSETTNTX0, + PseudoSF_VSETTNTX0X0 + ]>>>; + +// Returns true if this is PseudoSF_VSETTM or PseudoSF_VSETTK. +def isXSfmmVectorConfigTMTKInstr + : TIIPredicate<"isXSfmmVectorConfigTMTKInstr", + MCReturnStatement< + CheckOpcode<[ + PseudoSF_VSETTM, + PseudoSF_VSETTK + ]>>>; + +// Returns true if this is an XSfmm vector configuration instruction. +def isXSfmmVectorConfigInstr + : TIIPredicate<"isXSfmmVectorConfigInstr", + MCReturnStatement< + CheckOpcode<[ + PseudoSF_VSETTNT, + PseudoSF_VSETTNTX0, + PseudoSF_VSETTNTX0X0, + PseudoSF_VSETTM, + PseudoSF_VSETTK ]>>>; // Return true if this is 'vsetvli x0, x0, vtype' which preserves diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 40b6416..e9f43b9 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -178,6 +178,10 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Shadow stack pointer. markSuperRegs(Reserved, RISCV::SSP); + // XSfmmbase + for (MCPhysReg Reg = RISCV::T0; Reg <= RISCV::T15; Reg++) + markSuperRegs(Reserved, Reg); + assert(checkAllSuperRegsMarked(Reserved)); return Reserved; } diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 96ad5c6..0a8838c 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -156,13 +156,13 @@ FunctionPass *llvm::createRISCVVLOptimizerPass() { return new RISCVVLOptimizer(); } -LLVM_ATTRIBUTE_UNUSED +[[maybe_unused]] static raw_ostream &operator<<(raw_ostream &OS, const OperandInfo &OI) { OI.print(OS); return OS; } -LLVM_ATTRIBUTE_UNUSED +[[maybe_unused]] static raw_ostream &operator<<(raw_ostream &OS, const std::optional<OperandInfo> &OI) { if (OI) diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index b765fec..56a6168 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -139,8 +139,8 @@ void SPIRVAsmPrinter::emitEndOfAsmFile(Module &M) { // anymore. void SPIRVAsmPrinter::cleanUp(Module &M) { // Verifier disallows uses of intrinsic global variables.
- for (StringRef GVName : {"llvm.global_ctors", "llvm.global_dtors", - "llvm.used", "llvm.compiler.used"}) { + for (StringRef GVName : + {"llvm.global_ctors", "llvm.global_dtors", "llvm.used"}) { if (GlobalVariable *GV = M.getNamedGlobal(GVName)) GV->setName(""); } diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index e16c8f0..c6c6182 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -17,6 +17,7 @@ #include "SPIRVTargetMachine.h" #include "SPIRVUtils.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/StringSet.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/InstVisitor.h" @@ -2028,9 +2029,13 @@ Instruction *SPIRVEmitIntrinsics::visitUnreachableInst(UnreachableInst &I) { void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV, IRBuilder<> &B) { - // Skip special artifical variable llvm.global.annotations. - if (GV.getName() == "llvm.global.annotations") + // Skip special artificial variables. + static const StringSet<> ArtificialGlobals{"llvm.global.annotations", + "llvm.compiler.used"}; + + if (ArtificialGlobals.contains(GV.getName())) return; + Constant *Init = nullptr; if (hasInitializer(&GV)) { // Deduce element type and store results in Global Registry. diff --git a/llvm/lib/Target/SystemZ/SystemZ.h b/llvm/lib/Target/SystemZ/SystemZ.h index a0cf881..5a06ea3 100644 --- a/llvm/lib/Target/SystemZ/SystemZ.h +++ b/llvm/lib/Target/SystemZ/SystemZ.h @@ -24,6 +24,7 @@ class SystemZTargetMachine; namespace SystemZ { // Condition-code mask values. +const unsigned CCMASK_NONE = 0; const unsigned CCMASK_0 = 1 << 3; const unsigned CCMASK_1 = 1 << 2; const unsigned CCMASK_2 = 1 << 1; diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 3b7d11a..de28faf 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -15,6 +15,7 @@ #include "SystemZConstantPoolValue.h" #include "SystemZMachineFunctionInfo.h" #include "SystemZTargetMachine.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -24,6 +25,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsS390.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" @@ -1514,6 +1516,9 @@ SystemZTargetLowering::getConstraintType(StringRef Constraint) const { default: break; } + } else if (Constraint.size() == 5 && Constraint.starts_with("{")) { + if (StringRef("{@cc}").compare(Constraint) == 0) + return C_Other; } return TargetLowering::getConstraintType(Constraint); } @@ -1707,6 +1712,10 @@ SystemZTargetLowering::getRegForInlineAsmConstraint( return parseRegisterNumber(Constraint, &SystemZ::VR128BitRegClass, SystemZMC::VR128Regs, 32); } + if (Constraint[1] == '@') { + if (StringRef("{@cc}").compare(Constraint) == 0) + return std::make_pair(0u, &SystemZ::GR32BitRegClass); + } } return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } @@ -1737,6 +1746,38 @@ Register SystemZTargetLowering::getExceptionSelectorRegister( return Subtarget.isTargetXPLINK64() ? SystemZ::R2D : SystemZ::R7D; } +// Convert condition code in CCReg to an i32 value. 
+static SDValue getCCResult(SelectionDAG &DAG, SDValue CCReg) { + SDLoc DL(CCReg); + SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, CCReg); + return DAG.getNode(ISD::SRL, DL, MVT::i32, IPM, + DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32)); +} + +// Lower @cc targets via setcc. +SDValue SystemZTargetLowering::LowerAsmOutputForConstraint( + SDValue &Chain, SDValue &Glue, const SDLoc &DL, + const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const { + if (StringRef("{@cc}").compare(OpInfo.ConstraintCode) != 0) + return SDValue(); + + // Check that return type is valid. + if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() || + OpInfo.ConstraintVT.getSizeInBits() < 8) + report_fatal_error("Glue output operand is of invalid type"); + + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(SystemZ::CC); + + if (Glue.getNode()) { + Glue = DAG.getCopyFromReg(Chain, DL, SystemZ::CC, MVT::i32, Glue); + Chain = Glue.getValue(1); + } else + Glue = DAG.getCopyFromReg(Chain, DL, SystemZ::CC, MVT::i32); + return getCCResult(DAG, Glue); +} + void SystemZTargetLowering::LowerAsmOperandForConstraint( SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const { @@ -5300,14 +5341,6 @@ SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op, Node->getMemoryVT(), Node->getMemOperand()); } -// Convert condition code in CCReg to an i32 value. -static SDValue getCCResult(SelectionDAG &DAG, SDValue CCReg) { - SDLoc DL(CCReg); - SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, CCReg); - return DAG.getNode(ISD::SRL, DL, MVT::i32, IPM, - DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32)); -} - SDValue SystemZTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { @@ -8723,95 +8756,247 @@ SDValue SystemZTargetLowering::combineSETCC( return SDValue(); } -static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask) { +static std::pair<SDValue, int> findCCUse(const SDValue &Val) { + switch (Val.getOpcode()) { + default: + return std::make_pair(SDValue(), SystemZ::CCMASK_NONE); + case SystemZISD::IPM: + if (Val.getOperand(0).getOpcode() == SystemZISD::CLC || + Val.getOperand(0).getOpcode() == SystemZISD::STRCMP) + return std::make_pair(Val.getOperand(0), SystemZ::CCMASK_ICMP); + return std::make_pair(Val.getOperand(0), SystemZ::CCMASK_ANY); + case SystemZISD::SELECT_CCMASK: { + SDValue Op4CCReg = Val.getOperand(4); + if (Op4CCReg.getOpcode() == SystemZISD::ICMP || + Op4CCReg.getOpcode() == SystemZISD::TM) { + auto [OpCC, OpCCValid] = findCCUse(Op4CCReg.getOperand(0)); + if (OpCC != SDValue()) + return std::make_pair(OpCC, OpCCValid); + } + auto *CCValid = dyn_cast<ConstantSDNode>(Val.getOperand(2)); + if (!CCValid) + return std::make_pair(SDValue(), SystemZ::CCMASK_NONE); + int CCValidVal = CCValid->getZExtValue(); + return std::make_pair(Op4CCReg, CCValidVal); + } + case ISD::ADD: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: + auto [Op0CC, Op0CCValid] = findCCUse(Val.getOperand(0)); + if (Op0CC != SDValue()) + return std::make_pair(Op0CC, Op0CCValid); + return findCCUse(Val.getOperand(1)); + } +} + +static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask, + SelectionDAG &DAG); + +SmallVector<SDValue, 4> static simplifyAssumingCCVal(SDValue &Val, SDValue &CC, + SelectionDAG &DAG) { + SDLoc DL(Val); + auto Opcode = Val.getOpcode(); + switch (Opcode) { + default: + return {}; + case ISD::Constant: + return {Val, 
Val, Val, Val}; + case SystemZISD::IPM: { + SDValue IPMOp0 = Val.getOperand(0); + if (IPMOp0 != CC) + return {}; + SmallVector<SDValue, 4> ShiftedCCVals; + for (auto CC : {0, 1, 2, 3}) + ShiftedCCVals.emplace_back( + DAG.getConstant((CC << SystemZ::IPM_CC), DL, MVT::i32)); + return ShiftedCCVals; + } + case SystemZISD::SELECT_CCMASK: { + SDValue TrueVal = Val.getOperand(0), FalseVal = Val.getOperand(1); + auto *CCValid = dyn_cast<ConstantSDNode>(Val.getOperand(2)); + auto *CCMask = dyn_cast<ConstantSDNode>(Val.getOperand(3)); + if (!CCValid || !CCMask) + return {}; + + int CCValidVal = CCValid->getZExtValue(); + int CCMaskVal = CCMask->getZExtValue(); + const auto &&TrueSDVals = simplifyAssumingCCVal(TrueVal, CC, DAG); + const auto &&FalseSDVals = simplifyAssumingCCVal(FalseVal, CC, DAG); + if (TrueSDVals.empty() || FalseSDVals.empty()) + return {}; + SDValue Op4CCReg = Val.getOperand(4); + if (Op4CCReg != CC) + combineCCMask(Op4CCReg, CCValidVal, CCMaskVal, DAG); + if (Op4CCReg != CC) + return {}; + SmallVector<SDValue, 4> MergedSDVals; + for (auto &CCVal : {0, 1, 2, 3}) + MergedSDVals.emplace_back(((CCMaskVal & (1 << (3 - CCVal))) != 0) + ? TrueSDVals[CCVal] + : FalseSDVals[CCVal]); + return MergedSDVals; + } + case ISD::ADD: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case ISD::SRA: + // Avoid introducing CC spills (because ADD/AND/OR/XOR/SRA + // would clobber CC). + if (!Val.hasOneUse()) + return {}; + [[fallthrough]]; + case ISD::SHL: + case ISD::SRL: + SDValue Op0 = Val.getOperand(0), Op1 = Val.getOperand(1); + const auto &&Op0SDVals = simplifyAssumingCCVal(Op0, CC, DAG); + const auto &&Op1SDVals = simplifyAssumingCCVal(Op1, CC, DAG); + if (Op0SDVals.empty() || Op1SDVals.empty()) + return {}; + SmallVector<SDValue, 4> BinaryOpSDVals; + for (auto CCVal : {0, 1, 2, 3}) + BinaryOpSDVals.emplace_back(DAG.getNode( + Opcode, DL, Val.getValueType(), Op0SDVals[CCVal], Op1SDVals[CCVal])); + return BinaryOpSDVals; + } +} + +static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask, + SelectionDAG &DAG) { // We have a SELECT_CCMASK or BR_CCMASK comparing the condition code // set by the CCReg instruction using the CCValid / CCMask masks, - // If the CCReg instruction is itself a ICMP testing the condition + // If the CCReg instruction is itself a ICMP / TM testing the condition // code set by some other instruction, see whether we can directly // use that condition code. - - // Verify that we have an ICMP against some constant. - if (CCValid != SystemZ::CCMASK_ICMP) - return false; - auto *ICmp = CCReg.getNode(); - if (ICmp->getOpcode() != SystemZISD::ICMP) - return false; - auto *CompareLHS = ICmp->getOperand(0).getNode(); - auto *CompareRHS = dyn_cast<ConstantSDNode>(ICmp->getOperand(1)); - if (!CompareRHS) + auto *CCNode = CCReg.getNode(); + if (!CCNode) return false; - // Optimize the case where CompareLHS is a SELECT_CCMASK. - if (CompareLHS->getOpcode() == SystemZISD::SELECT_CCMASK) { - // Verify that we have an appropriate mask for a EQ or NE comparison. - bool Invert = false; - if (CCMask == SystemZ::CCMASK_CMP_NE) - Invert = !Invert; - else if (CCMask != SystemZ::CCMASK_CMP_EQ) + if (CCNode->getOpcode() == SystemZISD::TM) { + if (CCValid != SystemZ::CCMASK_TM) return false; - - // Verify that the ICMP compares against one of select values. 
- auto *TrueVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(0)); - if (!TrueVal) - return false; - auto *FalseVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1)); - if (!FalseVal) + auto emulateTMCCMask = [](const SDValue &Op0Val, const SDValue &Op1Val) { + auto *Op0Node = dyn_cast<ConstantSDNode>(Op0Val.getNode()); + auto *Op1Node = dyn_cast<ConstantSDNode>(Op1Val.getNode()); + if (!Op0Node || !Op1Node) + return -1; + auto Op0APVal = Op0Node->getAPIntValue(); + auto Op1APVal = Op1Node->getAPIntValue(); + auto Result = Op0APVal & Op1APVal; + bool AllOnes = Result == Op1APVal; + bool AllZeros = Result == 0; + bool IsLeftMostBitSet = Result[Op1APVal.getActiveBits()] != 0; + return AllZeros ? 0 : AllOnes ? 3 : IsLeftMostBitSet ? 2 : 1; + }; + SDValue Op0 = CCNode->getOperand(0); + SDValue Op1 = CCNode->getOperand(1); + auto [Op0CC, Op0CCValid] = findCCUse(Op0); + if (Op0CC == SDValue()) return false; - if (CompareRHS->getAPIntValue() == FalseVal->getAPIntValue()) - Invert = !Invert; - else if (CompareRHS->getAPIntValue() != TrueVal->getAPIntValue()) + const auto &&Op0SDVals = simplifyAssumingCCVal(Op0, Op0CC, DAG); + const auto &&Op1SDVals = simplifyAssumingCCVal(Op1, Op0CC, DAG); + if (Op0SDVals.empty() || Op1SDVals.empty()) return false; - - // Compute the effective CC mask for the new branch or select. - auto *NewCCValid = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(2)); - auto *NewCCMask = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(3)); - if (!NewCCValid || !NewCCMask) - return false; - CCValid = NewCCValid->getZExtValue(); - CCMask = NewCCMask->getZExtValue(); - if (Invert) - CCMask ^= CCValid; - - // Return the updated CCReg link. - CCReg = CompareLHS->getOperand(4); + int NewCCMask = 0; + for (auto CC : {0, 1, 2, 3}) { + auto CCVal = emulateTMCCMask(Op0SDVals[CC], Op1SDVals[CC]); + if (CCVal < 0) + return false; + NewCCMask <<= 1; + NewCCMask |= (CCMask & (1 << (3 - CCVal))) != 0; + } + NewCCMask &= Op0CCValid; + CCReg = Op0CC; + CCMask = NewCCMask; + CCValid = Op0CCValid; return true; } + if (CCNode->getOpcode() != SystemZISD::ICMP || + CCValid != SystemZ::CCMASK_ICMP) + return false; - // Optimize the case where CompareRHS is (SRA (SHL (IPM))). - if (CompareLHS->getOpcode() == ISD::SRA) { - auto *SRACount = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1)); - if (!SRACount || SRACount->getZExtValue() != 30) - return false; - auto *SHL = CompareLHS->getOperand(0).getNode(); - if (SHL->getOpcode() != ISD::SHL) - return false; - auto *SHLCount = dyn_cast<ConstantSDNode>(SHL->getOperand(1)); - if (!SHLCount || SHLCount->getZExtValue() != 30 - SystemZ::IPM_CC) - return false; - auto *IPM = SHL->getOperand(0).getNode(); - if (IPM->getOpcode() != SystemZISD::IPM) - return false; - - // Avoid introducing CC spills (because SRA would clobber CC). - if (!CompareLHS->hasOneUse()) - return false; - // Verify that the ICMP compares against zero. - if (CompareRHS->getZExtValue() != 0) + SDValue CmpOp0 = CCNode->getOperand(0); + SDValue CmpOp1 = CCNode->getOperand(1); + SDValue CmpOp2 = CCNode->getOperand(2); + auto [Op0CC, Op0CCValid] = findCCUse(CmpOp0); + if (Op0CC != SDValue()) { + const auto &&Op0SDVals = simplifyAssumingCCVal(CmpOp0, Op0CC, DAG); + const auto &&Op1SDVals = simplifyAssumingCCVal(CmpOp1, Op0CC, DAG); + if (Op0SDVals.empty() || Op1SDVals.empty()) return false; - // Compute the effective CC mask for the new branch or select. - CCMask = SystemZ::reverseCCMask(CCMask); - - // Return the updated CCReg link. 
- CCReg = IPM->getOperand(0); + auto *CmpType = dyn_cast<ConstantSDNode>(CmpOp2); + auto CmpTypeVal = CmpType->getZExtValue(); + const auto compareCCSigned = [&CmpTypeVal](const SDValue &Op0Val, + const SDValue &Op1Val) { + auto *Op0Node = dyn_cast<ConstantSDNode>(Op0Val.getNode()); + auto *Op1Node = dyn_cast<ConstantSDNode>(Op1Val.getNode()); + if (!Op0Node || !Op1Node) + return -1; + auto Op0APVal = Op0Node->getAPIntValue(); + auto Op1APVal = Op1Node->getAPIntValue(); + if (CmpTypeVal == SystemZICMP::SignedOnly) + return Op0APVal == Op1APVal ? 0 : Op0APVal.slt(Op1APVal) ? 1 : 2; + return Op0APVal == Op1APVal ? 0 : Op0APVal.ult(Op1APVal) ? 1 : 2; + }; + int NewCCMask = 0; + for (auto CC : {0, 1, 2, 3}) { + auto CCVal = compareCCSigned(Op0SDVals[CC], Op1SDVals[CC]); + if (CCVal < 0) + return false; + NewCCMask <<= 1; + NewCCMask |= (CCMask & (1 << (3 - CCVal))) != 0; + } + NewCCMask &= Op0CCValid; + CCMask = NewCCMask; + CCReg = Op0CC; + CCValid = Op0CCValid; return true; } return false; } -SDValue SystemZTargetLowering::combineBR_CCMASK( - SDNode *N, DAGCombinerInfo &DCI) const { +// Cost model for merging conditionals into one branch versus splitting them +// into multiple branches. +TargetLoweringBase::CondMergingParams +SystemZTargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc, + const Value *Lhs, + const Value *Rhs) const { + const auto isFlagOutOpCC = [](const Value *V) { + using namespace llvm::PatternMatch; + const Value *RHSVal; + const APInt *RHSC; + if (const auto *I = dyn_cast<Instruction>(V)) { + // PatternMatch.h provides concise tree-based pattern matching of LLVM IR. + if (match(I->getOperand(0), m_And(m_Value(RHSVal), m_APInt(RHSC))) || + match(I, m_Cmp(m_Value(RHSVal), m_APInt(RHSC)))) { + if (const auto *CB = dyn_cast<CallBase>(RHSVal)) { + if (CB->isInlineAsm()) { + const InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand()); + return IA && + IA->getConstraintString().find("{@cc}") != std::string::npos; + } + } + } + } + return false; + }; + // Pattern: (ICmp %asm) or (ICmp (And %asm)). + // The longest dependency chain (ICmp, And) has cost 2, so CostThreshold or + // BaseCost must be set >= 2. Conditionals are merged when the instruction + // cost is <= CostThreshold and split otherwise. + if (isFlagOutOpCC(Lhs) && isFlagOutOpCC(Rhs)) + return {3, 0, -1}; + // Default. + return {-1, -1, -1}; +} + +SDValue SystemZTargetLowering::combineBR_CCMASK(SDNode *N, + DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; // Combine BR_CCMASK (ICMP (SELECT_CCMASK)) into a single BR_CCMASK.
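For context on the "{@cc}" handling above (getConstraintType, getRegForInlineAsmConstraint, LowerAsmOutputForConstraint, getJumpConditionMergingParams): this is SystemZ's inline-asm flag-output constraint. A hypothetical source-level sketch of the IR shape isFlagOutOpCC() matches, an icmp, possibly through an and, against the asm's CC output; the "=@cc" spelling and the sample instruction are assumptions for illustration, not taken from this patch:

// Sketch: read the 2-bit condition code (0..3) straight out of inline asm.
// The new lowering materializes it with IPM + SRL rather than a flag spill.
extern void on_equal();
int x = 1, y = 2, cc;
asm("cr %1,%2" : "=@cc"(cc) : "d"(x), "d"(y)); // compare; CC is output %0
if (cc == 0) // equality test feeds BR_CCMASK, which combineCCMask can fold
  on_equal();

When both operands of a branch condition are such CC tests, getJumpConditionMergingParams returns {3, 0, -1} so the two tests are merged into a single CC-mask branch instead of being split.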
@@ -8824,8 +9009,7 @@ SDValue SystemZTargetLowering::combineBR_CCMASK( int CCMaskVal = CCMask->getZExtValue(); SDValue Chain = N->getOperand(0); SDValue CCReg = N->getOperand(4); - - if (combineCCMask(CCReg, CCValidVal, CCMaskVal)) + if (combineCCMask(CCReg, CCValidVal, CCMaskVal, DAG)) return DAG.getNode(SystemZISD::BR_CCMASK, SDLoc(N), N->getValueType(0), Chain, DAG.getTargetConstant(CCValidVal, SDLoc(N), MVT::i32), @@ -8848,16 +9032,80 @@ SDValue SystemZTargetLowering::combineSELECT_CCMASK( int CCMaskVal = CCMask->getZExtValue(); SDValue CCReg = N->getOperand(4); - if (combineCCMask(CCReg, CCValidVal, CCMaskVal)) - return DAG.getNode(SystemZISD::SELECT_CCMASK, SDLoc(N), N->getValueType(0), - N->getOperand(0), N->getOperand(1), - DAG.getTargetConstant(CCValidVal, SDLoc(N), MVT::i32), - DAG.getTargetConstant(CCMaskVal, SDLoc(N), MVT::i32), - CCReg); + bool IsCombinedCCReg = combineCCMask(CCReg, CCValidVal, CCMaskVal, DAG); + + // Populate an SDVals vector with the value chosen for each condition code + // CCVal of the given Val, which may itself be another nested select_ccmask + // on the same CC. + const auto constructCCSDValsFromSELECT = [&CCReg](SDValue &Val) { + if (Val.getOpcode() == SystemZISD::SELECT_CCMASK) { + SmallVector<SDValue, 4> Res; + if (Val.getOperand(4) != CCReg) + return SmallVector<SDValue, 4>{}; + SDValue TrueVal = Val.getOperand(0), FalseVal = Val.getOperand(1); + auto *CCMask = dyn_cast<ConstantSDNode>(Val.getOperand(3)); + if (!CCMask) + return SmallVector<SDValue, 4>{}; + + int CCMaskVal = CCMask->getZExtValue(); + for (auto &CC : {0, 1, 2, 3}) + Res.emplace_back(((CCMaskVal & (1 << (3 - CC))) != 0) ? TrueVal + : FalseVal); + return Res; + } + return SmallVector<SDValue, 4>{Val, Val, Val, Val}; + }; + // Attempt to optimize TrueVal/FalseVal of the outermost select_ccmask, + // either with the CCReg found by combineCCMask or with the original CCReg. + SDValue TrueVal = N->getOperand(0); + SDValue FalseVal = N->getOperand(1); + auto &&TrueSDVals = simplifyAssumingCCVal(TrueVal, CCReg, DAG); + auto &&FalseSDVals = simplifyAssumingCCVal(FalseVal, CCReg, DAG); + // TrueSDVals/FalseSDVals might be empty in case of non-constant + // TrueVal/FalseVal for select_ccmask, which cannot be optimized further. + if (TrueSDVals.empty()) + TrueSDVals = constructCCSDValsFromSELECT(TrueVal); + if (FalseSDVals.empty()) + FalseSDVals = constructCCSDValsFromSELECT(FalseVal); + if (!TrueSDVals.empty() && !FalseSDVals.empty()) { + SmallSet<SDValue, 4> MergedSDValsSet; + // Ignore CC values outside CCValid. + for (auto CC : {0, 1, 2, 3}) { + if ((CCValidVal & ((1 << (3 - CC)))) != 0) + MergedSDValsSet.insert(((CCMaskVal & (1 << (3 - CC))) != 0) + ? TrueSDVals[CC] + : FalseSDVals[CC]); + } + if (MergedSDValsSet.size() == 1) + return *MergedSDValsSet.begin(); + if (MergedSDValsSet.size() == 2) { + auto BeginIt = MergedSDValsSet.begin(); + SDValue NewTrueVal = *BeginIt, NewFalseVal = *std::next(BeginIt); + if (NewTrueVal == FalseVal || NewFalseVal == TrueVal) + std::swap(NewTrueVal, NewFalseVal); + int NewCCMask = 0; + for (auto CC : {0, 1, 2, 3}) { + NewCCMask <<= 1; + NewCCMask |= ((CCMaskVal & (1 << (3 - CC))) != 0) + ?
(TrueSDVals[CC] == NewTrueVal) + : (FalseSDVals[CC] == NewTrueVal); + } + CCMaskVal = NewCCMask; + CCMaskVal &= CCValidVal; + TrueVal = NewTrueVal; + FalseVal = NewFalseVal; + IsCombinedCCReg = true; + } + } + + if (IsCombinedCCReg) + return DAG.getNode( + SystemZISD::SELECT_CCMASK, SDLoc(N), N->getValueType(0), TrueVal, + FalseVal, DAG.getTargetConstant(CCValidVal, SDLoc(N), MVT::i32), + DAG.getTargetConstant(CCMaskVal, SDLoc(N), MVT::i32), CCReg); + return SDValue(); } - SDValue SystemZTargetLowering::combineGET_CCMASK( SDNode *N, DAGCombinerInfo &DCI) const { diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index f8706b7..d5b7603 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -533,6 +533,18 @@ public: } const char *getTargetNodeName(unsigned Opcode) const override; + + // This function currently returns the cost of the srl/ipm/cc sequence used + // for merging. + CondMergingParams + getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, + const Value *Rhs) const override; + + // Handle lowering of flag assembly outputs. + SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, + const SDLoc &DL, + const AsmOperandInfo &Constraint, + SelectionDAG &DAG) const override; + std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 6472334..47c24fc 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -317,6 +317,15 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Custom); } + if (Subtarget->hasFP16()) { + setOperationAction(ISD::FMA, MVT::v8f16, Legal); + } + + if (Subtarget->hasRelaxedSIMD()) { + setOperationAction(ISD::FMULADD, MVT::v4f32, Legal); + setOperationAction(ISD::FMULADD, MVT::v2f64, Legal); + } + // Partial MLA reductions.
for (auto Op : {ISD::PARTIAL_REDUCE_SMLA, ISD::PARTIAL_REDUCE_UMLA}) { setPartialReduceMLAAction(Op, MVT::v4i32, MVT::v16i8, Legal); @@ -1120,6 +1129,18 @@ WebAssemblyTargetLowering::getPreferredVectorAction(MVT VT) const { return TargetLoweringBase::getPreferredVectorAction(VT); } +bool WebAssemblyTargetLowering::isFMAFasterThanFMulAndFAdd( + const MachineFunction &MF, EVT VT) const { + if (!Subtarget->hasFP16() || !VT.isVector()) + return false; + + EVT ScalarVT = VT.getScalarType(); + if (!ScalarVT.isSimple()) + return false; + + return ScalarVT.getSimpleVT().SimpleTy == MVT::f16; +} + bool WebAssemblyTargetLowering::shouldSimplifyDemandedVectorElts( SDValue Op, const TargetLoweringOpt &TLO) const { // ISel process runs DAGCombiner after legalization; this step is called diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index b33a853..472ec67 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -81,6 +81,8 @@ private: TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + EVT VT) const override; SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 49af78b..0f6e1ca 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1213,6 +1213,27 @@ defm EXTMUL_LOW_U : defm EXTMUL_HIGH_U : SIMDExtBinary<I64x2, extmul_high_u, "extmul_high_i32x4_u", 0xdf>; +// Pattern for i32x4.dot_i16x8_s +def : Pat< + (v4i32 (add + (wasm_shuffle + (v4i32 (extmul_low_s v8i16:$lhs, v8i16:$rhs)), + (v4i32 (extmul_high_s v8i16:$lhs, v8i16:$rhs)), + (i32 0), (i32 1), (i32 2), (i32 3), + (i32 8), (i32 9), (i32 10), (i32 11), + (i32 16), (i32 17), (i32 18), (i32 19), + (i32 24), (i32 25), (i32 26), (i32 27)), + (wasm_shuffle + (v4i32 (extmul_low_s v8i16:$lhs, v8i16:$rhs)), + (v4i32 (extmul_high_s v8i16:$lhs, v8i16:$rhs)), + (i32 4), (i32 5), (i32 6), (i32 7), + (i32 12), (i32 13), (i32 14), (i32 15), + (i32 20), (i32 21), (i32 22), (i32 23), + (i32 28), (i32 29), (i32 30), (i32 31))) + ), + (v4i32 (DOT v8i16:$lhs, v8i16:$rhs)) +>; + //===----------------------------------------------------------------------===// // Floating-point unary arithmetic //===----------------------------------------------------------------------===// @@ -1626,7 +1647,8 @@ defm "" : RelaxedConvert<I32x4, F64x2, int_wasm_relaxed_trunc_unsigned_zero, // Relaxed (Negative) Multiply-Add (madd/nmadd) //===----------------------------------------------------------------------===// -multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate> reqs> { +multiclass RELAXED_SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, + list<Predicate> reqs> { defm MADD_#vec : SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins), [(set (vec.vt V128:$dst), (int_wasm_relaxed_madd @@ -1640,16 +1662,46 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate> vec.prefix#".relaxed_nmadd\t$dst, $a, $b, $c", vec.prefix#".relaxed_nmadd", simdopS, reqs>; - def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))), - (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>; 
+ def : Pat<(fadd_contract (fmul_contract (vec.vt V128:$a), (vec.vt V128:$b)), (vec.vt V128:$c)), + (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>; + def : Pat<(fmuladd (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)), + (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>; - def : Pat<(fsub_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))), - (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>; + def : Pat<(fsub_contract (vec.vt V128:$c), (fmul_contract (vec.vt V128:$a), (vec.vt V128:$b))), + (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>; + def : Pat<(fmuladd (fneg (vec.vt V128:$a)), (vec.vt V128:$b), (vec.vt V128:$c)), + (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>; } -defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>; -defm "" : SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>; -defm "" : SIMDMADD<F16x8, 0x14e, 0x14f, [HasFP16]>; +defm "" : RELAXED_SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>; +defm "" : RELAXED_SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>; + +//===----------------------------------------------------------------------===// +// FP16 (Negative) Multiply-Add (madd/nmadd) +//===----------------------------------------------------------------------===// + +multiclass HALF_PRECISION_SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, + list<Predicate> reqs> { + defm MADD_#vec : + SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins), + [(set (vec.vt V128:$dst), (fma + (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))], + vec.prefix#".madd\t$dst, $a, $b, $c", + vec.prefix#".madd", simdopA, reqs>; + defm NMADD_#vec : + SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins), + [(set (vec.vt V128:$dst), (fma + (fneg (vec.vt V128:$a)), (vec.vt V128:$b), (vec.vt V128:$c)))], + vec.prefix#".nmadd\t$dst, $a, $b, $c", + vec.prefix#".nmadd", simdopS, reqs>; +} +defm "" : HALF_PRECISION_SIMDMADD<F16x8, 0x14e, 0x14f, [HasFP16]>; + +// TODO: I think separate intrinsics should be introduced for these FP16 operations. 
+def : Pat<(v8f16 (int_wasm_relaxed_madd (v8f16 V128:$a), (v8f16 V128:$b), (v8f16 V128:$c))), + (MADD_F16x8 V128:$a, V128:$b, V128:$c)>; +def : Pat<(v8f16 (int_wasm_relaxed_nmadd (v8f16 V128:$a), (v8f16 V128:$b), (v8f16 V128:$c))), + (NMADD_F16x8 V128:$a, V128:$b, V128:$c)>; //===----------------------------------------------------------------------===// // Laneselect diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index 28fa2cd..b81641f 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -414,6 +414,8 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + getActionDefinitionsBuilder(G_IS_FPCLASS).lower(); + // fp constants getActionDefinitionsBuilder(G_FCONSTANT) .legalFor({s32, s64}) diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 3af8b3e..2bf016a 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -1335,9 +1335,8 @@ def ProcessorFeatures { !listconcat(ARLFeatures, ARLSAdditionalFeatures); // Pantherlake - list<SubtargetFeature> PTLAdditionalFeatures = [FeaturePREFETCHI]; list<SubtargetFeature> PTLFeatures = - !listremove(!listconcat(ARLSFeatures, PTLAdditionalFeatures), [FeatureWIDEKL]); + !listremove(ARLSFeatures, [FeatureWIDEKL]); // Clearwaterforest diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index eea84a2..c32b1a6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3624,6 +3624,16 @@ X86TargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc, match(Lhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())) && match(Rhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value()))) BaseCost += 1; + + // For OR conditions with EQ comparisons, prefer splitting into branches + // (unless CCMP is available). OR+EQ cannot be optimized via bitwise ops, + // unlike OR+NE which becomes (P|Q)!=0. Similarly, don't split signed + // comparisons (SLT, SGT) that can be optimized. + if (BaseCost >= 0 && !Subtarget.hasCCMP() && Opc == Instruction::Or && + match(Lhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())) && + match(Rhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value()))) + return {-1, -1, -1}; + return {BaseCost, BrMergingLikelyBias.getValue(), BrMergingUnlikelyBias.getValue()}; } @@ -3787,7 +3797,7 @@ static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) { /// Return true if every element in Mask, is an in-place blend/select mask or is /// undef. -LLVM_ATTRIBUTE_UNUSED static bool isBlendOrUndef(ArrayRef<int> Mask) { +[[maybe_unused]] static bool isBlendOrUndef(ArrayRef<int> Mask) { unsigned NumElts = Mask.size(); for (auto [I, M] : enumerate(Mask)) if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts)) @@ -8096,7 +8106,7 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, return DstVec; } -LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) { +[[maybe_unused]] static bool isHorizOp(unsigned Opcode) { switch (Opcode) { case X86ISD::PACKSS: case X86ISD::PACKUS: @@ -20813,7 +20823,7 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, // for DAG type consistency we have to match the FP operand type. 
APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000)); - LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK; + [[maybe_unused]] APFloat::opStatus Status = APFloat::opOK; bool LosesInfo = false; if (TheVT == MVT::f64) // The rounding mode is irrelevant as the conversion should be exact. @@ -22856,7 +22866,7 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, // be generated by the memcmp expansion pass with oversized integer compares // (see PR33325). bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X); - if (isNullConstant(Y) && !IsOrXorXorTreeCCZero) + if (isNullConstant(Y) && OpSize == 128 && !IsOrXorXorTreeCCZero) return SDValue(); // Don't perform this combine if constructing the vector will be expensive. diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp index acf8e4c..5ea63a9 100644 --- a/llvm/lib/TargetParser/RISCVTargetParser.cpp +++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp @@ -228,6 +228,10 @@ void printVType(unsigned VType, raw_ostream &OS) { OS << ", mu"; } +void printXSfmmVType(unsigned VType, raw_ostream &OS) { + OS << "e" << getSEW(VType) << ", w" << getXSfmmWiden(VType); +} + unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul) { unsigned LMul; bool Fractional; diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index edca7c1..1932a3a 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -175,7 +175,7 @@ constexpr FeatureBitset FeaturesArrowlakeS = FeaturesArrowlake | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 | FeatureSM4; constexpr FeatureBitset FeaturesPantherlake = - (FeaturesArrowlakeS ^ FeatureWIDEKL) | FeaturePREFETCHI; + (FeaturesArrowlakeS ^ FeatureWIDEKL); constexpr FeatureBitset FeaturesClearwaterforest = (FeaturesSierraforest ^ FeatureWIDEKL) | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 | FeatureSM4 | FeaturePREFETCHI | FeatureUSERMSR; diff --git a/llvm/lib/Transforms/Coroutines/CoroCloner.h b/llvm/lib/Transforms/Coroutines/CoroCloner.h index 26ec4f3..e05fe28 100644 --- a/llvm/lib/Transforms/Coroutines/CoroCloner.h +++ b/llvm/lib/Transforms/Coroutines/CoroCloner.h @@ -1,3 +1,4 @@ +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -19,9 +20,7 @@ #include "llvm/Transforms/Coroutines/CoroInstr.h" #include "llvm/Transforms/Utils/ValueMapper.h" -namespace llvm { - -namespace coro { +namespace llvm::coro { enum class CloneKind { /// The shared resume function for a switch lowering. @@ -149,8 +148,6 @@ public: } }; -} // end namespace coro - -} // end namespace llvm +} // end namespace llvm::coro #endif // LLVM_LIB_TRANSFORMS_COROUTINES_COROCLONER_H diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp index 471b9eb..cdb5852 100644 --- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp @@ -38,7 +38,7 @@ public: AnyResumeFnPtrTy(PointerType::getUnqual(Context)) {} void lowerEarlyIntrinsics(Function &F); }; -} +} // namespace // Replace a direct call to coro.resume or coro.destroy with an indirect call to // an address returned by coro.subfn.addr intrinsic. 
This is done so that diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h index 52f4ffe..cc47a55 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -16,11 +16,7 @@ #include "llvm/Transforms/Coroutines/CoroInstr.h" #include "llvm/Transforms/Coroutines/CoroShape.h" -namespace llvm { - -class CallGraph; - -namespace coro { +namespace llvm::coro { bool isSuspendBlock(BasicBlock *BB); bool declaresAnyIntrinsic(const Module &M); @@ -61,7 +57,6 @@ void normalizeCoroutine(Function &F, coro::Shape &Shape, CallInst *createMustTailCall(DebugLoc Loc, Function *MustTailCallFn, TargetTransformInfo &TTI, ArrayRef<Value *> Arguments, IRBuilder<> &); -} // End namespace coro. -} // End namespace llvm +} // End namespace llvm::coro #endif diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp index 6aaabca..f2444da 100644 --- a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp +++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp @@ -137,8 +137,7 @@ struct RematGraph { } // namespace -namespace llvm { -template <> struct GraphTraits<RematGraph *> { +template <> struct llvm::GraphTraits<RematGraph *> { using NodeRef = RematGraph::RematNode *; using ChildIteratorType = RematGraph::RematNode **; @@ -149,8 +148,6 @@ template <> struct GraphTraits<RematGraph *> { static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); } }; -} // end namespace llvm - // For each instruction identified as materializable across the suspend point, // and its associated DAG of other rematerializable instructions, // recreate the DAG of instructions after the suspend point. diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp index e474c07..81fe0c9 100644 --- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp +++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp @@ -16,11 +16,8 @@ #include "llvm/IR/InstIterator.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -namespace llvm { - -namespace coro { - -namespace { +using namespace llvm; +using namespace llvm::coro; typedef SmallPtrSet<BasicBlock *, 8> VisitedBlocksSet; @@ -71,7 +68,7 @@ static bool isLocalAlloca(CoroAllocaAllocInst *AI) { /// This happens during the all-instructions iteration, so it must not /// delete the call. static Instruction * -lowerNonLocalAlloca(CoroAllocaAllocInst *AI, const coro::Shape &Shape, +lowerNonLocalAlloca(CoroAllocaAllocInst *AI, const Shape &Shape, SmallVectorImpl<Instruction *> &DeadInsts) { IRBuilder<> Builder(AI); auto Alloc = Shape.emitAlloc(Builder, AI->getSize(), nullptr); @@ -450,10 +447,8 @@ static void collectFrameAlloca(AllocaInst *AI, const coro::Shape &Shape, Visitor.getMayWriteBeforeCoroBegin()); } -} // namespace - -void collectSpillsFromArgs(SpillInfo &Spills, Function &F, - const SuspendCrossingInfo &Checker) { +void coro::collectSpillsFromArgs(SpillInfo &Spills, Function &F, + const SuspendCrossingInfo &Checker) { // Collect the spills for arguments and other not-materializable values. 
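The SpillUtils cleanup above trades reopened namespace blocks for using-directives plus qualified definition names such as coro::collectSpillsFromArgs. A standalone sketch of the idiom, with invented names:

    // util.h
    namespace proj::util {
    bool isReady(); // declared inside the namespace
    }

    // util.cpp
    using namespace proj::util;

    // A qualified-id definition must match an existing declaration, so any
    // signature drift becomes a hard compile error instead of silently
    // introducing a second function in a reopened namespace.
    bool proj::util::isReady() { return true; }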
for (Argument &A : F.args()) for (User *U : A.users()) @@ -461,7 +456,7 @@ void collectSpillsFromArgs(SpillInfo &Spills, Function &F, Spills[&A].push_back(cast<Instruction>(U)); } -void collectSpillsAndAllocasFromInsts( +void coro::collectSpillsAndAllocasFromInsts( SpillInfo &Spills, SmallVector<AllocaInfo, 8> &Allocas, SmallVector<Instruction *, 4> &DeadInstructions, SmallVector<CoroAllocaAllocInst *, 4> &LocalAllocas, Function &F, @@ -516,8 +511,8 @@ void collectSpillsAndAllocasFromInsts( } } -void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F, - const SuspendCrossingInfo &Checker) { +void coro::collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F, + const SuspendCrossingInfo &Checker) { // We don't want the layout of coroutine frame to be affected // by debug information. So we only choose to salvage dbg.values for // whose value is already in the frame. @@ -535,10 +530,9 @@ void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F, /// Async and Retcon{Once} conventions assume that all spill uses can be sunk /// after the coro.begin intrinsic. -void sinkSpillUsesAfterCoroBegin(const DominatorTree &Dom, - CoroBeginInst *CoroBegin, - coro::SpillInfo &Spills, - SmallVectorImpl<coro::AllocaInfo> &Allocas) { +void coro::sinkSpillUsesAfterCoroBegin( + const DominatorTree &Dom, CoroBeginInst *CoroBegin, coro::SpillInfo &Spills, + SmallVectorImpl<coro::AllocaInfo> &Allocas) { SmallSetVector<Instruction *, 32> ToMove; SmallVector<Instruction *, 32> Worklist; @@ -582,8 +576,9 @@ void sinkSpillUsesAfterCoroBegin(const DominatorTree &Dom, Inst->moveBefore(InsertPt->getIterator()); } -BasicBlock::iterator getSpillInsertionPt(const coro::Shape &Shape, Value *Def, - const DominatorTree &DT) { +BasicBlock::iterator coro::getSpillInsertionPt(const coro::Shape &Shape, + Value *Def, + const DominatorTree &DT) { BasicBlock::iterator InsertPt; if (auto *Arg = dyn_cast<Argument>(Def)) { // For arguments, we will place the store instruction right after @@ -625,7 +620,3 @@ BasicBlock::iterator getSpillInsertionPt(const coro::Shape &Shape, Value *Def, return InsertPt; } - -} // End namespace coro. - -} // End namespace llvm. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 7071876..943c223 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -471,7 +471,6 @@ private: Value *simplifyNonNullOperand(Value *V, bool HasDereferenceable, unsigned Depth = 0); -public: /// Create `select C, S1, S2`. Use only when the profile cannot be calculated /// from existing profile metadata: if the Function has profiles, this will /// set the profile of this select to "unknown". @@ -484,6 +483,7 @@ public: return Sel; } +public: /// Create and insert the idiom we use to indicate a block is unreachable /// without having to rewrite the CFG from within InstCombine. 
void CreateNonTerminatorUnreachable(Instruction *InsertAt) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 6b67b48..09cb225 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -2979,10 +2979,14 @@ Instruction *InstCombinerImpl::foldAndOrOfSelectUsingImpliedCond(Value *Op, "Op must be either i1 or vector of i1."); if (SI.getCondition()->getType() != Op->getType()) return nullptr; - if (Value *V = simplifyNestedSelectsUsingImpliedCond(SI, Op, IsAnd, DL)) - return SelectInst::Create(Op, - IsAnd ? V : ConstantInt::getTrue(Op->getType()), - IsAnd ? ConstantInt::getFalse(Op->getType()) : V); + if (Value *V = simplifyNestedSelectsUsingImpliedCond(SI, Op, IsAnd, DL)) { + Instruction *MDFrom = nullptr; + if (!ProfcheckDisableMetadataFixes) + MDFrom = &SI; + return SelectInst::Create( + Op, IsAnd ? V : ConstantInt::getTrue(Op->getType()), + IsAnd ? ConstantInt::getFalse(Op->getType()) : V, "", nullptr, MDFrom); + } return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 63e24a0..a330bb7 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -110,8 +110,8 @@ static Value *simplifyShiftSelectingPackedElement(Instruction *I, ShrAmt->getName() + ".z"); // There is no existing !prof metadata we can derive the !prof metadata for // this select. - Value *Select = IC.createSelectInstWithUnknownProfile(ShrAmtZ, Lower, Upper); - IC.Builder.Insert(Select); + Value *Select = IC.Builder.CreateSelectWithUnknownProfile(ShrAmtZ, Lower, + Upper, DEBUG_TYPE); Select->takeName(I); return Select; } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 82ac903..3f11cae 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1690,6 +1690,11 @@ Instruction *InstCombinerImpl::foldFBinOpOfIntCastsFromSign( // 2) (fp_binop ({s|u}itofp x), FpC) // -> ({s|u}itofp (int_binop x, (fpto{s|u}i FpC))) Instruction *InstCombinerImpl::foldFBinOpOfIntCasts(BinaryOperator &BO) { + // Don't perform the fold on vectors, as the integer operation may be much + // more expensive than the float operation in that case. 
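For reference, a sketch (not code from the patch) of the scalar shape this fold produces; the float operation becomes an integer one of the same width, which only pays off when the integer op is cheap:

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // fadd (sitofp %x), K  ==>  sitofp (add %x, K'), with K' = fptosi K.
    Value *foldedScalarForm(IRBuilder<> &B, Value *X, ConstantInt *KPrime) {
      Value *Sum = B.CreateAdd(X, KPrime);
      return B.CreateSIToFP(Sum, B.getFloatTy());
    }

For vectors, the integer binop can cost far more than the float op it replaces, hence the early return below.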
+ if (BO.getType()->isVectorTy()) + return nullptr; + std::array<Value *, 2> IntOps = {nullptr, nullptr}; Constant *Op1FpC = nullptr; // Check for: diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 860f8f7..511bca4 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -3337,7 +3337,7 @@ PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond, Value *ValueIfTrue, Instruction *ThenTerm, Value *ValueIfFalse) { - PHINode *PHI = IRB.CreatePHI(IntptrTy, 2); + PHINode *PHI = IRB.CreatePHI(ValueIfTrue->getType(), 2); BasicBlock *CondBlock = cast<Instruction>(Cond)->getParent(); PHI->addIncoming(ValueIfFalse, CondBlock); BasicBlock *ThenBlock = ThenTerm->getParent(); @@ -3360,7 +3360,7 @@ Value *FunctionStackPoisoner::createAllocaForLayout( assert((ClRealignStack & (ClRealignStack - 1)) == 0); uint64_t FrameAlignment = std::max(L.FrameAlignment, uint64_t(ClRealignStack)); Alloca->setAlignment(Align(FrameAlignment)); - return IRB.CreatePointerCast(Alloca, IntptrTy); + return Alloca; } void FunctionStackPoisoner::createDynamicAllocasInitStorage() { @@ -3572,10 +3572,12 @@ void FunctionStackPoisoner::processStaticAllocas() { DoDynamicAlloca &= !HasInlineAsm && !HasReturnsTwiceCall; DoStackMalloc &= !HasInlineAsm && !HasReturnsTwiceCall; + Type *PtrTy = F.getDataLayout().getAllocaPtrType(F.getContext()); Value *StaticAlloca = DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false); - Value *FakeStack; + Value *FakeStackPtr; + Value *FakeStackInt; Value *LocalStackBase; Value *LocalStackBaseAlloca; uint8_t DIExprFlags = DIExpression::ApplyOffset; @@ -3603,20 +3605,21 @@ void FunctionStackPoisoner::processStaticAllocas() { RTCI.createRuntimeCall(IRBIf, AsanStackMallocFunc[StackMallocIdx], ConstantInt::get(IntptrTy, LocalStackSize)); IRB.SetInsertPoint(InsBefore); - FakeStack = createPHI(IRB, UseAfterReturnIsEnabled, FakeStackValue, Term, - ConstantInt::get(IntptrTy, 0)); + FakeStackInt = createPHI(IRB, UseAfterReturnIsEnabled, FakeStackValue, + Term, ConstantInt::get(IntptrTy, 0)); } else { // assert(ASan.UseAfterReturn == AsanDetectStackUseAfterReturnMode:Always) // void *FakeStack = __asan_stack_malloc_N(LocalStackSize); // void *LocalStackBase = (FakeStack) ? FakeStack : // alloca(LocalStackSize); StackMallocIdx = StackMallocSizeClass(LocalStackSize); - FakeStack = + FakeStackInt = RTCI.createRuntimeCall(IRB, AsanStackMallocFunc[StackMallocIdx], ConstantInt::get(IntptrTy, LocalStackSize)); } + FakeStackPtr = IRB.CreateIntToPtr(FakeStackInt, PtrTy); Value *NoFakeStack = - IRB.CreateICmpEQ(FakeStack, Constant::getNullValue(IntptrTy)); + IRB.CreateICmpEQ(FakeStackInt, Constant::getNullValue(IntptrTy)); Instruction *Term = SplitBlockAndInsertIfThen(NoFakeStack, InsBefore, false); IRBuilder<> IRBIf(Term); @@ -3624,67 +3627,53 @@ void FunctionStackPoisoner::processStaticAllocas() { DoDynamicAlloca ? 
createAllocaForLayout(IRBIf, L, true) : StaticAlloca; IRB.SetInsertPoint(InsBefore); - LocalStackBase = createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStack); + LocalStackBase = + createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStackPtr); IRB.CreateStore(LocalStackBase, LocalStackBaseAlloca); DIExprFlags |= DIExpression::DerefBefore; } else { // void *FakeStack = nullptr; // void *LocalStackBase = alloca(LocalStackSize); - FakeStack = ConstantInt::get(IntptrTy, 0); + FakeStackInt = Constant::getNullValue(IntptrTy); + FakeStackPtr = Constant::getNullValue(PtrTy); LocalStackBase = DoDynamicAlloca ? createAllocaForLayout(IRB, L, true) : StaticAlloca; LocalStackBaseAlloca = LocalStackBase; } - // It shouldn't matter whether we pass an `alloca` or a `ptrtoint` as the - // dbg.declare address opereand, but passing a `ptrtoint` seems to confuse - // later passes and can result in dropped variable coverage in debug info. - Value *LocalStackBaseAllocaPtr = - isa<PtrToIntInst>(LocalStackBaseAlloca) - ? cast<PtrToIntInst>(LocalStackBaseAlloca)->getPointerOperand() - : LocalStackBaseAlloca; - assert(isa<AllocaInst>(LocalStackBaseAllocaPtr) && - "Variable descriptions relative to ASan stack base will be dropped"); - // Replace Alloca instructions with base+offset. SmallVector<Value *> NewAllocaPtrs; for (const auto &Desc : SVD) { AllocaInst *AI = Desc.AI; - replaceDbgDeclare(AI, LocalStackBaseAllocaPtr, DIB, DIExprFlags, - Desc.Offset); - Value *NewAllocaPtr = IRB.CreateIntToPtr( - IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)), - AI->getType()); + replaceDbgDeclare(AI, LocalStackBaseAlloca, DIB, DIExprFlags, Desc.Offset); + Value *NewAllocaPtr = IRB.CreatePtrAdd( + LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)); AI->replaceAllUsesWith(NewAllocaPtr); NewAllocaPtrs.push_back(NewAllocaPtr); } // The left-most redzone has enough space for at least 4 pointers. // Write the Magic value to redzone[0]. - Value *BasePlus0 = IRB.CreateIntToPtr(LocalStackBase, IntptrPtrTy); IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic), - BasePlus0); + LocalStackBase); // Write the frame description constant to redzone[1]. - Value *BasePlus1 = IRB.CreateIntToPtr( - IRB.CreateAdd(LocalStackBase, - ConstantInt::get(IntptrTy, ASan.LongSize / 8)), - IntptrPtrTy); + Value *BasePlus1 = IRB.CreatePtrAdd( + LocalStackBase, ConstantInt::get(IntptrTy, ASan.LongSize / 8)); GlobalVariable *StackDescriptionGlobal = createPrivateGlobalForString(*F.getParent(), DescriptionString, /*AllowMerging*/ true, genName("stack")); Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy); IRB.CreateStore(Description, BasePlus1); // Write the PC to redzone[2]. - Value *BasePlus2 = IRB.CreateIntToPtr( - IRB.CreateAdd(LocalStackBase, - ConstantInt::get(IntptrTy, 2 * ASan.LongSize / 8)), - IntptrPtrTy); + Value *BasePlus2 = IRB.CreatePtrAdd( + LocalStackBase, ConstantInt::get(IntptrTy, 2 * ASan.LongSize / 8)); IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2); const auto &ShadowAfterScope = GetShadowBytesAfterScope(SVD, L); // Poison the stack red zones at the entry. - Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB); + Value *ShadowBase = + ASan.memToShadow(IRB.CreatePtrToInt(LocalStackBase, IntptrTy), IRB); // As mask we must use most poisoned case: red zones and after scope. // As bytes we can use either the same or just red zones only. 
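A rewrite that recurs throughout this ASan hunk: address arithmetic moves from integer adds plus inttoptr to pointer-typed byte offsets. A side-by-side sketch with invented names:

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Old shape: the base address is carried as an integer.
    Value *offsetViaInt(IRBuilder<> &B, Value *BaseInt, uint64_t Off,
                        Type *IntptrTy, Type *PtrTy) {
      Value *Sum = B.CreateAdd(BaseInt, ConstantInt::get(IntptrTy, Off));
      return B.CreateIntToPtr(Sum, PtrTy);
    }

    // New shape: the base stays a pointer; CreatePtrAdd emits an i8 GEP,
    // which preserves pointer provenance for later analyses.
    Value *offsetViaPtr(IRBuilder<> &B, Value *BasePtr, uint64_t Off,
                        Type *IntptrTy) {
      return B.CreatePtrAdd(BasePtr, ConstantInt::get(IntptrTy, Off));
    }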
copyToShadow(ShadowAfterScope, ShadowAfterScope, IRB, ShadowBase); @@ -3723,7 +3712,7 @@ void FunctionStackPoisoner::processStaticAllocas() { IRBuilder<> IRBRet(Ret); // Mark the current frame as retired. IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic), - BasePlus0); + LocalStackBase); if (DoStackMalloc) { assert(StackMallocIdx >= 0); // if FakeStack != 0 // LocalStackBase == FakeStack @@ -3737,7 +3726,7 @@ void FunctionStackPoisoner::processStaticAllocas() { // else // <This is not a fake stack; unpoison the redzones> Value *Cmp = - IRBRet.CreateICmpNE(FakeStack, Constant::getNullValue(IntptrTy)); + IRBRet.CreateICmpNE(FakeStackInt, Constant::getNullValue(IntptrTy)); Instruction *ThenTerm, *ElseTerm; SplitBlockAndInsertIfThenElse(Cmp, Ret, &ThenTerm, &ElseTerm); @@ -3748,11 +3737,10 @@ void FunctionStackPoisoner::processStaticAllocas() { kAsanStackUseAfterReturnMagic); copyToShadow(ShadowAfterReturn, ShadowAfterReturn, IRBPoison, ShadowBase); - Value *SavedFlagPtrPtr = IRBPoison.CreateAdd( - FakeStack, + Value *SavedFlagPtrPtr = IRBPoison.CreatePtrAdd( + FakeStackPtr, ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8)); - Value *SavedFlagPtr = IRBPoison.CreateLoad( - IntptrTy, IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy)); + Value *SavedFlagPtr = IRBPoison.CreateLoad(IntptrTy, SavedFlagPtrPtr); IRBPoison.CreateStore( Constant::getNullValue(IRBPoison.getInt8Ty()), IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getPtrTy())); @@ -3760,7 +3748,7 @@ void FunctionStackPoisoner::processStaticAllocas() { // For larger frames call __asan_stack_free_*. RTCI.createRuntimeCall( IRBPoison, AsanStackFreeFunc[StackMallocIdx], - {FakeStack, ConstantInt::get(IntptrTy, LocalStackSize)}); + {FakeStackInt, ConstantInt::get(IntptrTy, LocalStackSize)}); } IRBuilder<> IRBElse(ElseTerm); diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp index c86092b..a6ec6c1 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/MemoryProfileInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/StaticDataProfileInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" @@ -194,6 +195,30 @@ static bool isAllocationWithHotColdVariant(const Function *Callee, } } +static void HandleUnsupportedAnnotationKinds(GlobalVariable &GVar, + AnnotationKind Kind) { + assert(Kind != llvm::memprof::AnnotationKind::AnnotationOK && + "Should not handle AnnotationOK here"); + SmallString<32> Reason; + switch (Kind) { + case llvm::memprof::AnnotationKind::ExplicitSection: + ++NumOfMemProfExplicitSectionGlobalVars; + Reason.append("explicit section name"); + break; + case llvm::memprof::AnnotationKind::DeclForLinker: + Reason.append("linker declaration"); + break; + case llvm::memprof::AnnotationKind::ReservedName: + Reason.append("name starts with `llvm.`"); + break; + default: + llvm_unreachable("Unexpected annotation kind"); + } + LLVM_DEBUG(dbgs() << "Skip annotation for " << GVar.getName() << " due to " + << Reason << ".\n"); + return; +} + struct AllocMatchInfo { uint64_t TotalSize = 0; AllocationType AllocType = AllocationType::None; @@ -775,29 +800,13 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) { return PreservedAnalyses::none(); } -// Returns true iff the global variable 
has custom section either by -// __attribute__((section("name"))) -// (https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate) -// or #pragma clang section directives -// (https://clang.llvm.org/docs/LanguageExtensions.html#specifying-section-names-for-global-objects-pragma-clang-section). -static bool hasExplicitSectionName(const GlobalVariable &GVar) { - if (GVar.hasSection()) - return true; - - auto Attrs = GVar.getAttributes(); - if (Attrs.hasAttribute("bss-section") || Attrs.hasAttribute("data-section") || - Attrs.hasAttribute("relro-section") || - Attrs.hasAttribute("rodata-section")) - return true; - return false; -} - bool MemProfUsePass::annotateGlobalVariables( Module &M, const memprof::DataAccessProfData *DataAccessProf) { if (!AnnotateStaticDataSectionPrefix || M.globals().empty()) return false; if (!DataAccessProf) { + M.addModuleFlag(Module::Warning, "EnableDataAccessProf", 0U); M.getContext().diagnose(DiagnosticInfoPGOProfile( MemoryProfileFileName.data(), StringRef("Data access profiles not found in memprof. Ignore " DS_Warning)); return false; } + M.addModuleFlag(Module::Warning, "EnableDataAccessProf", 1U); bool Changed = false; // Iterate all global variables in the module and annotate them based on @@ -815,13 +825,9 @@ for (GlobalVariable &GVar : M.globals()) { assert(!GVar.getSectionPrefix().has_value() && "GVar shouldn't have section prefix yet"); - if (GVar.isDeclarationForLinker()) - continue; - - if (hasExplicitSectionName(GVar)) { - ++NumOfMemProfExplicitSectionGlobalVars; - LLVM_DEBUG(dbgs() << "Global variable " << GVar.getName() - << " has explicit section name. Skip annotating.\n"); + auto Kind = llvm::memprof::getAnnotationKind(GVar); + if (Kind != llvm::memprof::AnnotationKind::AnnotationOK) { + HandleUnsupportedAnnotationKinds(GVar, Kind); continue; } @@ -831,7 +837,6 @@ bool MemProfUsePass::annotateGlobalVariables( // TODO: Track string content hash in the profiles and compute it inside the // compiler to categorize the hotness of string literals.
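The "EnableDataAccessProf" module flag recorded above gives later passes a cheap way to check whether data-access profiles were available. A consumer-side sketch (assumed usage, not part of the patch):

    #include "llvm/IR/Metadata.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    bool hasDataAccessProf(const Module &M) {
      // getModuleFlag returns the metadata recorded by addModuleFlag, or
      // null when the flag was never set.
      auto *CI = mdconst::extract_or_null<ConstantInt>(
          M.getModuleFlag("EnableDataAccessProf"));
      return CI && CI->getZExtValue() == 1;
    }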
if (Name.starts_with(".str")) { - LLVM_DEBUG(dbgs() << "Skip annotating string literal " << Name << "\n"); continue; } diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index ff5f390..66e45ec 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -266,8 +266,7 @@ void DFAJumpThreading::unfold(DomTreeUpdater *DTU, LoopInfo *LI, if (!ProfcheckDisableMetadataFixes) BI->setMetadata(LLVMContext::MD_prof, SI->getMetadata(LLVMContext::MD_prof)); - DTU->applyUpdates({{DominatorTree::Insert, StartBlock, EndBlock}, - {DominatorTree::Insert, StartBlock, NewBlock}}); + DTU->applyUpdates({{DominatorTree::Insert, StartBlock, NewBlock}}); } else { BasicBlock *EndBlock = SIUse->getParent(); BasicBlock *NewBlockT = BasicBlock::Create( @@ -1479,10 +1478,13 @@ bool DFAJumpThreading::run(Function &F) { DTU->flush(); #ifdef EXPENSIVE_CHECKS - assert(DTU->getDomTree().verify(DominatorTree::VerificationLevel::Full)); verifyFunction(F, &dbgs()); #endif + if (MadeChanges && VerifyDomInfo) + assert(DTU->getDomTree().verify(DominatorTree::VerificationLevel::Full) && + "Failed to maintain validity of domtree!"); + return MadeChanges; } diff --git a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index 213d0f3..1335665 100644 --- a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -39,10 +39,11 @@ public: private: AliasAnalysis *AA; }; +} // namespace /// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function, /// iterating until no more changes are made. -bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { +static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { bool Changed = false; bool LocalChange = true; @@ -67,7 +68,6 @@ bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { } return Changed; } -} // namespace char FlattenCFGLegacyPass::ID = 0; diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 56e0569..7cae94eb 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -1295,6 +1295,24 @@ public: return commonAlignment(InitialAlign, ElementSizeInBits / 8); } + IntegerType *getIndexType(Value *Ptr) const { + return cast<IntegerType>(DL.getIndexType(Ptr->getType())); + } + + Value *getIndex(Value *Ptr, uint64_t V) const { + return ConstantInt::get(getIndexType(Ptr), V); + } + + Value *castToIndexType(Value *Ptr, Value *V, IRBuilder<> &Builder) const { + assert(isa<IntegerType>(V->getType()) && + "Attempted to cast non-integral type to integer index"); + // In case the data layout's index type differs in width from the type of + // the value we're given, truncate or zero extend to the appropriate width. + // We zero extend here as indices are unsigned. + return Builder.CreateZExtOrTrunc(V, getIndexType(Ptr), + V->getName() + ".cast"); + } + /// Load a matrix with \p Shape starting at \p Ptr and using \p Stride between /// vectors. 
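The new getIndex/castToIndexType helpers encode that GEP indices should use the pointer's index width from the data layout, which can be narrower than i64 on some targets. The core of the helper in isolation:

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Coerce a stride to the index width of Ptr's address space. Indices
    // are treated as unsigned, hence zext rather than sext on widening.
    Value *toIndexWidth(IRBuilder<> &B, const DataLayout &DL, Value *Ptr,
                        Value *Stride) {
      Type *IdxTy = DL.getIndexType(Ptr->getType());
      return B.CreateZExtOrTrunc(Stride, IdxTy);
    }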
MatrixTy loadMatrix(Type *Ty, Value *Ptr, MaybeAlign MAlign, Value *Stride, @@ -1304,6 +1322,7 @@ public: Type *VecTy = FixedVectorType::get(EltTy, Shape.getStride()); Value *EltPtr = Ptr; MatrixTy Result; + Stride = castToIndexType(Ptr, Stride, Builder); for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) { Value *GEP = computeVectorAddr( EltPtr, Builder.getIntN(Stride->getType()->getScalarSizeInBits(), I), @@ -1325,14 +1344,14 @@ public: ShapeInfo ResultShape, Type *EltTy, IRBuilder<> &Builder) { Value *Offset = Builder.CreateAdd( - Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I); + Builder.CreateMul(J, getIndex(MatrixPtr, MatrixShape.getStride())), I); Value *TileStart = Builder.CreateGEP(EltTy, MatrixPtr, Offset); auto *TileTy = FixedVectorType::get(EltTy, ResultShape.NumRows * ResultShape.NumColumns); return loadMatrix(TileTy, TileStart, Align, - Builder.getInt64(MatrixShape.getStride()), IsVolatile, + getIndex(MatrixPtr, MatrixShape.getStride()), IsVolatile, ResultShape, Builder); } @@ -1363,14 +1382,15 @@ public: MaybeAlign MAlign, bool IsVolatile, ShapeInfo MatrixShape, Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) { Value *Offset = Builder.CreateAdd( - Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I); + Builder.CreateMul(J, getIndex(MatrixPtr, MatrixShape.getStride())), I); Value *TileStart = Builder.CreateGEP(EltTy, MatrixPtr, Offset); auto *TileTy = FixedVectorType::get(EltTy, StoreVal.getNumRows() * StoreVal.getNumColumns()); storeMatrix(TileTy, StoreVal, TileStart, MAlign, - Builder.getInt64(MatrixShape.getStride()), IsVolatile, Builder); + getIndex(MatrixPtr, MatrixShape.getStride()), IsVolatile, + Builder); } /// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between @@ -1380,6 +1400,7 @@ public: IRBuilder<> &Builder) { auto *VType = cast<FixedVectorType>(Ty); Value *EltPtr = Ptr; + Stride = castToIndexType(Ptr, Stride, Builder); for (auto Vec : enumerate(StoreVal.vectors())) { Value *GEP = computeVectorAddr( EltPtr, @@ -2011,18 +2032,17 @@ public: const unsigned TileM = std::min(M - K, unsigned(TileSize)); MatrixTy A = loadMatrix(APtr, LoadOp0->getAlign(), LoadOp0->isVolatile(), - LShape, Builder.getInt64(I), Builder.getInt64(K), + LShape, getIndex(APtr, I), getIndex(APtr, K), {TileR, TileM}, EltType, Builder); MatrixTy B = loadMatrix(BPtr, LoadOp1->getAlign(), LoadOp1->isVolatile(), - RShape, Builder.getInt64(K), Builder.getInt64(J), + RShape, getIndex(BPtr, K), getIndex(BPtr, J), {TileM, TileC}, EltType, Builder); emitMatrixMultiply(Res, A, B, Builder, true, false, getFastMathFlags(MatMul)); } storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(), {R, M}, - Builder.getInt64(I), Builder.getInt64(J), EltType, - Builder); + getIndex(CPtr, I), getIndex(CPtr, J), EltType, Builder); } } @@ -2254,15 +2274,14 @@ public: /// Lower load instructions. 
MatrixTy VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr, IRBuilder<> &Builder) { - return LowerLoad(Inst, Ptr, Inst->getAlign(), - Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI, - Builder); + return LowerLoad(Inst, Ptr, Inst->getAlign(), getIndex(Ptr, SI.getStride()), + Inst->isVolatile(), SI, Builder); } MatrixTy VisitStore(StoreInst *Inst, const ShapeInfo &SI, Value *StoredVal, Value *Ptr, IRBuilder<> &Builder) { return LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(), - Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI, + getIndex(Ptr, SI.getStride()), Inst->isVolatile(), SI, Builder); } diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index b9d332b..d2f09e9 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -118,9 +118,13 @@ STATISTIC( STATISTIC(NumDeleted, "Number of instructions deleted"); STATISTIC(NumVectorized, "Number of vectorized aggregates"); +namespace llvm { /// Disable running mem2reg during SROA in order to test or debug SROA. static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false), cl::Hidden); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} // namespace llvm + namespace { class AllocaSliceRewriter; @@ -1777,7 +1781,8 @@ static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI, } Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, - LI.getName() + ".sroa.speculated"); + LI.getName() + ".sroa.speculated", + ProfcheckDisableMetadataFixes ? nullptr : &SI); LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n"); LI.replaceAllUsesWith(V); @@ -4360,10 +4365,13 @@ private: }; Value *Cond, *True, *False; + Instruction *MDFrom = nullptr; if (auto *SI = dyn_cast<SelectInst>(Sel)) { Cond = SI->getCondition(); True = SI->getTrueValue(); False = SI->getFalseValue(); + if (!ProfcheckDisableMetadataFixes) + MDFrom = SI; } else { Cond = Sel->getOperand(0); True = ConstantInt::get(Sel->getType(), 1); @@ -4383,8 +4391,12 @@ private: IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(), False->getName() + ".sroa.gep", NW); - Value *NSel = - IRB.CreateSelect(Cond, NTrue, NFalse, Sel->getName() + ".sroa.sel"); + Value *NSel = MDFrom + ? IRB.CreateSelect(Cond, NTrue, NFalse, + Sel->getName() + ".sroa.sel", MDFrom) + : IRB.CreateSelectWithUnknownProfile( + Cond, NTrue, NFalse, DEBUG_TYPE, + Sel->getName() + ".sroa.sel"); Visited.erase(&GEPI); GEPI.replaceAllUsesWith(NSel); GEPI.eraseFromParent(); diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 2ee91a9..0f3978f 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -47,6 +47,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Transforms/Utils/SSAUpdaterBulk.h" #include <cassert> #include <utility> @@ -321,7 +322,7 @@ class StructurizeCFG { void collectInfos(); - void insertConditions(bool Loops); + void insertConditions(bool Loops, SSAUpdaterBulk &PhiInserter); void simplifyConditions(); @@ -671,10 +672,9 @@ void StructurizeCFG::collectInfos() { } /// Insert the missing branch conditions -void StructurizeCFG::insertConditions(bool Loops) { +void StructurizeCFG::insertConditions(bool Loops, SSAUpdaterBulk &PhiInserter) { BranchVector &Conds = Loops ? LoopConds : Conditions; Value *Default = Loops ? 
BoolTrue : BoolFalse; - SSAUpdater PhiInserter; for (BranchInst *Term : Conds) { assert(Term->isConditional()); @@ -683,8 +683,9 @@ void StructurizeCFG::insertConditions(bool Loops) { BasicBlock *SuccTrue = Term->getSuccessor(0); BasicBlock *SuccFalse = Term->getSuccessor(1); - PhiInserter.Initialize(Boolean, ""); - PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default); + unsigned Variable = PhiInserter.AddVariable("", Boolean); + PhiInserter.AddAvailableValue(Variable, Loops ? SuccFalse : Parent, + Default); BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue]; @@ -697,7 +698,7 @@ void StructurizeCFG::insertConditions(bool Loops) { ParentInfo = PI; break; } - PhiInserter.AddAvailableValue(BB, PI.Pred); + PhiInserter.AddAvailableValue(Variable, BB, PI.Pred); Dominator.addAndRememberBlock(BB); } @@ -706,9 +707,9 @@ void StructurizeCFG::insertConditions(bool Loops) { CondBranchWeights::setMetadata(*Term, ParentInfo.Weights); } else { if (!Dominator.resultIsRememberedBlock()) - PhiInserter.AddAvailableValue(Dominator.result(), Default); + PhiInserter.AddAvailableValue(Variable, Dominator.result(), Default); - Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent)); + PhiInserter.AddUse(Variable, &Term->getOperandUse(0)); } } } @@ -1414,8 +1415,12 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT, orderNodes(); collectInfos(); createFlow(); - insertConditions(false); - insertConditions(true); + + SSAUpdaterBulk PhiInserter; + insertConditions(false, PhiInserter); + insertConditions(true, PhiInserter); + PhiInserter.RewriteAndOptimizeAllUses(*DT); + setPhiValues(); simplifyHoistedPhis(); simplifyConditions(); diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index b187208..32924e7 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -44,7 +44,7 @@ using namespace llvm; STATISTIC(RemappedAtomMax, "Highest global NextAtomGroup (after mapping)"); void llvm::mapAtomInstance(const DebugLoc &DL, ValueToValueMapTy &VMap) { - auto CurGroup = DL->getAtomGroup(); + uint64_t CurGroup = DL->getAtomGroup(); if (!CurGroup) return; @@ -62,21 +62,20 @@ void llvm::mapAtomInstance(const DebugLoc &DL, ValueToValueMapTy &VMap) { RemappedAtomMax = std::max<uint64_t>(NewGroup, RemappedAtomMax); } -namespace { -void collectDebugInfoFromInstructions(const Function &F, - DebugInfoFinder &DIFinder) { +static void collectDebugInfoFromInstructions(const Function &F, + DebugInfoFinder &DIFinder) { const Module *M = F.getParent(); - if (M) { - // Inspect instructions to process e.g. DILexicalBlocks of inlined functions - for (const auto &I : instructions(F)) - DIFinder.processInstruction(*M, I); - } + if (!M) + return; + // Inspect instructions to process e.g. DILexicalBlocks of inlined functions + for (const Instruction &I : instructions(F)) + DIFinder.processInstruction(*M, I); } // Create a predicate that matches the metadata that should be identity mapped // during function cloning. -MetadataPredicate createIdentityMDPredicate(const Function &F, - CloneFunctionChangeType Changes) { +static MetadataPredicate +createIdentityMDPredicate(const Function &F, CloneFunctionChangeType Changes) { if (Changes >= CloneFunctionChangeType::DifferentModule) return [](const Metadata *MD) { return false; }; @@ -107,7 +106,6 @@ MetadataPredicate createIdentityMDPredicate(const Function &F, return false; }; } -} // namespace /// See comments in Cloning.h. 
BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, @@ -213,10 +211,9 @@ void llvm::CloneFunctionMetadataInto(Function &NewFunc, const Function &OldFunc, const MetadataPredicate *IdentityMD) { SmallVector<std::pair<unsigned, MDNode *>, 1> MDs; OldFunc.getAllMetadata(MDs); - for (auto MD : MDs) { - NewFunc.addMetadata(MD.first, - *MapMetadata(MD.second, VMap, RemapFlag, TypeMapper, - Materializer, IdentityMD)); + for (const auto &[Kind, MD] : MDs) { + NewFunc.addMetadata(Kind, *MapMetadata(MD, VMap, RemapFlag, TypeMapper, + Materializer, IdentityMD)); } } @@ -235,7 +232,6 @@ void llvm::CloneFunctionBodyInto(Function &NewFunc, const Function &OldFunc, // appropriate. Note that we save BE this way in order to handle cloning of // recursive functions into themselves. for (const BasicBlock &BB : OldFunc) { - // Create a new basic block and copy instructions into it! BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, &NewFunc, CodeInfo); @@ -321,7 +317,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, // Cloning is always a Module level operation, since Metadata needs to be // cloned. - const auto RemapFlag = RF_None; + const RemapFlags RemapFlag = RF_None; CloneFunctionMetadataInto(*NewFunc, *OldFunc, VMap, RemapFlag, TypeMapper, Materializer, &IdentityMD); @@ -346,8 +342,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, // visiting the metadata attached to global values, which would allow this // code to be deleted. Alternatively, perhaps give responsibility for this // update to CloneFunctionInto's callers. - auto *NewModule = NewFunc->getParent(); - auto *NMD = NewModule->getOrInsertNamedMetadata("llvm.dbg.cu"); + Module *NewModule = NewFunc->getParent(); + NamedMDNode *NMD = NewModule->getOrInsertNamedMetadata("llvm.dbg.cu"); // Avoid multiple insertions of the same DICompileUnit to NMD. SmallPtrSet<const void *, 8> Visited(llvm::from_range, NMD->operands()); @@ -355,7 +351,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, // the function (e.g. as instructions' scope). DebugInfoFinder DIFinder; collectDebugInfoFromInstructions(*OldFunc, DIFinder); - for (auto *Unit : DIFinder.compile_units()) { + for (DICompileUnit *Unit : DIFinder.compile_units()) { MDNode *MappedUnit = MapMetadata(Unit, VMap, RF_None, TypeMapper, Materializer); if (Visited.insert(MappedUnit).second) @@ -821,17 +817,16 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, --PredCount[Pred]; // Figure out how many entries to remove from each PHI. - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - ++PredCount[PN->getIncomingBlock(i)]; + for (BasicBlock *Pred : PN->blocks()) + ++PredCount[Pred]; // At this point, the excess predecessor entries are positive in the // map. Loop over all of the PHIs and remove excess predecessor // entries. BasicBlock::iterator I = NewBB->begin(); for (; (PN = dyn_cast<PHINode>(I)); ++I) { - for (const auto &PCI : PredCount) { - BasicBlock *Pred = PCI.first; - for (unsigned NumToRemove = PCI.second; NumToRemove; --NumToRemove) + for (const auto &[Pred, Count] : PredCount) { + for ([[maybe_unused]] unsigned _ : llvm::seq<unsigned>(Count)) PN->removeIncomingValue(Pred, false); } } @@ -866,8 +861,8 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // As phi-nodes have been now remapped, allow incremental simplification of // newly-cloned instructions. 
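A small idiom from the PHI-pruning rewrite above: llvm::seq<T>(N) iterates the half-open range [0, N), turning the manual count-down into a self-documenting repeat-N-times loop. Sketch:

    #include "llvm/ADT/Sequence.h"
    #include "llvm/Support/raw_ostream.h"

    void repeatDemo() {
      for (unsigned I : llvm::seq<unsigned>(3))
        llvm::outs() << I << ' '; // prints: 0 1 2
    }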
const DataLayout &DL = NewFunc->getDataLayout(); - for (const auto &BB : *OldFunc) { - for (const auto &I : BB) { + for (const BasicBlock &BB : *OldFunc) { + for (const Instruction &I : BB) { auto *NewI = dyn_cast_or_null<Instruction>(VMap.lookup(&I)); if (!NewI) continue; @@ -997,8 +992,8 @@ void llvm::CloneAndPruneFunctionInto( void llvm::remapInstructionsInBlocks(ArrayRef<BasicBlock *> Blocks, ValueToValueMapTy &VMap) { // Rewrite the code to refer to itself. - for (auto *BB : Blocks) { - for (auto &Inst : *BB) { + for (BasicBlock *BB : Blocks) { + for (Instruction &Inst : *BB) { RemapDbgRecordRange(Inst.getModule(), Inst.getDbgRecordRange(), VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); RemapInstruction(&Inst, VMap, @@ -1151,9 +1146,9 @@ void llvm::cloneNoAliasScopes(ArrayRef<MDNode *> NoAliasDeclScopes, StringRef Ext, LLVMContext &Context) { MDBuilder MDB(Context); - for (auto *ScopeList : NoAliasDeclScopes) { - for (const auto &MDOperand : ScopeList->operands()) { - if (MDNode *MD = dyn_cast<MDNode>(MDOperand)) { + for (MDNode *ScopeList : NoAliasDeclScopes) { + for (const MDOperand &MDOp : ScopeList->operands()) { + if (MDNode *MD = dyn_cast<MDNode>(MDOp)) { AliasScopeNode SNANode(MD); std::string Name; @@ -1177,7 +1172,7 @@ void llvm::adaptNoAliasScopes(Instruction *I, auto CloneScopeList = [&](const MDNode *ScopeList) -> MDNode * { bool NeedsReplacement = false; SmallVector<Metadata *, 8> NewScopeList; - for (const auto &MDOp : ScopeList->operands()) { + for (const MDOperand &MDOp : ScopeList->operands()) { if (MDNode *MD = dyn_cast<MDNode>(MDOp)) { if (auto *NewMD = ClonedScopes.lookup(MD)) { NewScopeList.push_back(NewMD); @@ -1193,12 +1188,12 @@ void llvm::adaptNoAliasScopes(Instruction *I, }; if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(I)) - if (auto *NewScopeList = CloneScopeList(Decl->getScopeList())) + if (MDNode *NewScopeList = CloneScopeList(Decl->getScopeList())) Decl->setScopeList(NewScopeList); auto replaceWhenNeeded = [&](unsigned MD_ID) { if (const MDNode *CSNoAlias = I->getMetadata(MD_ID)) - if (auto *NewScopeList = CloneScopeList(CSNoAlias)) + if (MDNode *NewScopeList = CloneScopeList(CSNoAlias)) I->setMetadata(MD_ID, NewScopeList); }; replaceWhenNeeded(LLVMContext::MD_noalias); diff --git a/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp b/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp index d7bf791..fb39fdd 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp @@ -11,11 +11,11 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/SSAUpdaterBulk.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/IteratedDominanceFrontier.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" @@ -112,7 +112,7 @@ struct BBValueInfo { void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT, SmallVectorImpl<PHINode *> *InsertedPHIs) { DenseMap<BasicBlock *, BBValueInfo> BBInfos; - for (auto &R : Rewrites) { + for (RewriteInfo &R : Rewrites) { BBInfos.clear(); // Compute locations for new phi-nodes. @@ -145,7 +145,7 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT, BBInfos[BB].LiveOutValue = V; // We've computed IDF, now insert new phi-nodes there. 
- for (auto *FrontierBB : IDFBlocks) { + for (BasicBlock *FrontierBB : IDFBlocks) { IRBuilder<> B(FrontierBB, FrontierBB->begin()); PHINode *PN = B.CreatePHI(R.Ty, 0, R.Name); BBInfos[FrontierBB].LiveInValue = PN; @@ -156,7 +156,7 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT, // IsLiveOut indicates whether we are computing live-out values (true) or // live-in values (false). auto ComputeValue = [&](BasicBlock *BB, bool IsLiveOut) -> Value * { - auto *BBInfo = &BBInfos[BB]; + BBValueInfo *BBInfo = &BBInfos[BB]; if (IsLiveOut && BBInfo->LiveOutValue) return BBInfo->LiveOutValue; @@ -187,7 +187,7 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT, if (!V) V = UndefValue::get(R.Ty); - for (auto *BBInfo : Stack) + for (BBValueInfo *BBInfo : Stack) // Loop above can insert new entries into the BBInfos map: assume the // map shouldn't grow due to [1] and BBInfo references are valid. BBInfo->LiveInValue = V; @@ -196,7 +196,7 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT, }; // Fill in arguments of the inserted PHIs. - for (auto *BB : IDFBlocks) { + for (BasicBlock *BB : IDFBlocks) { auto *PHI = cast<PHINode>(&BB->front()); for (BasicBlock *Pred : PredCache.get(BB)) PHI->addIncoming(ComputeValue(Pred, /*IsLiveOut=*/true), Pred); @@ -222,3 +222,96 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT, } } } + +// Perform a single pass of simplification over the worklist of PHIs. +// This should be called after RewriteAllUses() because simplifying PHIs +// immediately after creation would require updating all references to those +// PHIs in the BBValueInfo structures, which would necessitate additional +// reference tracking overhead. +static void simplifyPass(MutableArrayRef<PHINode *> Worklist, + const DataLayout &DL) { + for (PHINode *&PHI : Worklist) { + if (Value *Simplified = simplifyInstruction(PHI, DL)) { + PHI->replaceAllUsesWith(Simplified); + PHI->eraseFromParent(); + PHI = nullptr; // Mark as removed. + } + } +} + +#ifndef NDEBUG // Should this be under EXPENSIVE_CHECKS? +// New PHI nodes should not reference one another but they may reference +// themselves or existing PHI nodes, and existing PHI nodes may reference new +// PHI nodes. +static bool +PHIAreRefEachOther(const iterator_range<BasicBlock::phi_iterator> NewPHIs) { + SmallPtrSet<PHINode *, 8> NewPHISet; + for (PHINode &PN : NewPHIs) + NewPHISet.insert(&PN); + for (PHINode &PHI : NewPHIs) { + for (Value *V : PHI.incoming_values()) { + PHINode *IncPHI = dyn_cast<PHINode>(V); + if (IncPHI && IncPHI != &PHI && NewPHISet.contains(IncPHI)) + return true; + } + } + return false; +} +#endif + +static bool replaceIfIdentical(PHINode &PHI, PHINode &ReplPHI) { + if (!PHI.isIdenticalToWhenDefined(&ReplPHI)) + return false; + PHI.replaceAllUsesWith(&ReplPHI); + PHI.eraseFromParent(); + return true; +} + +bool EliminateNewDuplicatePHINodes(BasicBlock *BB, + BasicBlock::phi_iterator FirstExistingPN) { + assert(!PHIAreRefEachOther(make_range(BB->phis().begin(), FirstExistingPN))); + + // Deduplicate new PHIs first to reduce the number of comparisons on the + // following new -> existing pass. + bool Changed = false; + for (auto I = BB->phis().begin(); I != FirstExistingPN; ++I) { + for (auto J = std::next(I); J != FirstExistingPN;) { + Changed |= replaceIfIdentical(*J++, *I); + } + } + + // Iterate over existing PHIs and replace identical new PHIs. 
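Putting this hunk together with the StructurizeCFG change earlier in the patch, a client drives the bulk updater roughly as follows (sketch, invented names):

    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
    using namespace llvm;

    void rewriteCondition(DominatorTree &DT, Type *BoolTy, BasicBlock *DefBB,
                          Value *DefVal, BranchInst *Term) {
      SSAUpdaterBulk Updater;
      unsigned Var = Updater.AddVariable("cond", BoolTy); // one slot per value
      Updater.AddAvailableValue(Var, DefBB, DefVal);      // where it is defined
      Updater.AddUse(Var, &Term->getOperandUse(0));       // use to be rewritten
      // Batch-inserts PHIs, then runs the simplify and deduplicate passes
      // defined in this hunk.
      Updater.RewriteAndOptimizeAllUses(DT);
    }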
+ for (PHINode &ExistingPHI : make_range(FirstExistingPN, BB->phis().end())) { + auto I = BB->phis().begin(); + assert(I != FirstExistingPN); // Should be at least one new PHI. + do { + Changed |= replaceIfIdentical(*I++, ExistingPHI); + } while (I != FirstExistingPN); + if (BB->phis().begin() == FirstExistingPN) + return Changed; + } + return Changed; +} + +static void deduplicatePass(ArrayRef<PHINode *> Worklist) { + SmallDenseMap<BasicBlock *, unsigned> BBs; + for (PHINode *PHI : Worklist) { + if (PHI) + ++BBs[PHI->getParent()]; + } + + for (auto [BB, NumNewPHIs] : BBs) { + auto FirstExistingPN = std::next(BB->phis().begin(), NumNewPHIs); + EliminateNewDuplicatePHINodes(BB, FirstExistingPN); + } +} + +void SSAUpdaterBulk::RewriteAndOptimizeAllUses(DominatorTree &DT) { + SmallVector<PHINode *, 4> PHIs; + RewriteAllUses(&DT, &PHIs); + if (PHIs.empty()) + return; + + simplifyPass(PHIs, PHIs.front()->getParent()->getDataLayout()); + deduplicatePass(PHIs); +} diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 45cee1e..9035e58 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -526,7 +526,7 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) { // Recognize the canonical representation of an unsimplifed urem. const SCEV *URemLHS = nullptr; const SCEV *URemRHS = nullptr; - if (SE.matchURem(S, URemLHS, URemRHS)) { + if (match(S, m_scev_URem(m_SCEV(URemLHS), m_SCEV(URemRHS), SE))) { Value *LHS = expand(URemLHS); Value *RHS = expand(URemRHS); return InsertBinop(Instruction::URem, LHS, RHS, SCEV::FlagAnyWrap, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a6f4bec..88af2cf 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10659,7 +10659,8 @@ class InstructionsCompatibilityAnalysis { static bool isSupportedOpcode(const unsigned Opcode) { return Opcode == Instruction::Add || Opcode == Instruction::LShr || Opcode == Instruction::Shl || Opcode == Instruction::SDiv || - Opcode == Instruction::UDiv; + Opcode == Instruction::UDiv || Opcode == Instruction::And || + Opcode == Instruction::Or || Opcode == Instruction::Xor; } /// Identifies the best candidate value, which represents main opcode @@ -10984,6 +10985,9 @@ public: case Instruction::Shl: case Instruction::SDiv: case Instruction::UDiv: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind); break; default: @@ -19456,7 +19460,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } assert(getNumElements(Cond->getType()) == TrueNumElements && "Cannot vectorize Instruction::Select"); - Value *V = Builder.CreateSelect(Cond, True, False); + Value *V = + Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE); V = FinalShuffle(V, E); E->VectorizedValue = V; @@ -23576,18 +23581,19 @@ class HorizontalReduction { switch (Kind) { case RecurKind::Or: { if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy)) - return Builder.CreateSelect( + return Builder.CreateSelectWithUnknownProfile( LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)), - RHS, Name); + RHS, DEBUG_TYPE, Name); unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, Name); } case RecurKind::And: { if (UseSelect 
&& OpTy == CmpInst::makeCmpResultType(OpTy)) - return Builder.CreateSelect( + return Builder.CreateSelectWithUnknownProfile( LHS, RHS, - ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name); + ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), + DEBUG_TYPE, Name); unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, Name); @@ -23608,7 +23614,8 @@ class HorizontalReduction { if (UseSelect) { CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind); Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); + return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE, + Name); } [[fallthrough]]; case RecurKind::FMax: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 1fea068..0101942 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -635,9 +635,9 @@ static bool hasConditionalTerminator(const VPBasicBlock *VPBB) { const VPRecipeBase *R = &VPBB->back(); bool IsSwitch = isa<VPInstruction>(R) && cast<VPInstruction>(R)->getOpcode() == Instruction::Switch; - bool IsCondBranch = isa<VPBranchOnMaskRecipe>(R) || - match(R, m_BranchOnCond(m_VPValue())) || - match(R, m_BranchOnCount(m_VPValue(), m_VPValue())); + bool IsCondBranch = + isa<VPBranchOnMaskRecipe>(R) || + match(R, m_CombineOr(m_BranchOnCond(), m_BranchOnCount())); (void)IsCondBranch; (void)IsSwitch; if (VPBB->getNumSuccessors() == 2 || diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index fb696be..23f5623 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -24,12 +24,9 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H #define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H -#include "VPlanAnalysis.h" #include "VPlanValue.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist.h" @@ -41,10 +38,11 @@ #include "llvm/IR/Operator.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/InstructionCost.h" -#include <algorithm> #include <cassert> #include <cstddef> +#include <functional> #include <string> +#include <utility> namespace llvm { @@ -346,13 +344,6 @@ public: /// Return the cost of the block. virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0; - /// Return true if it is legal to hoist instructions into this block. - bool isLegalToHoistInto() { - // There are currently no constraints that prevent an instruction to be - // hoisted into a VPBlockBase. - return true; - } - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void printAsOperand(raw_ostream &OS, bool PrintType = false) const { OS << getName(); @@ -1064,6 +1055,7 @@ public: ResumeForEpilogue, /// Returns the value for vscale. VScale, + OpsEnd = VScale, }; /// Returns true if this VPInstruction generates scalar values for all lanes. diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 81deba2..c0147ce 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -433,8 +433,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB, // We are about to replace the branch to exit the region. 
Remove the original // BranchOnCond, if there is any. DebugLoc LatchDL = DL; - if (!LatchVPBB->empty() && - match(&LatchVPBB->back(), m_BranchOnCond(m_VPValue()))) { + if (!LatchVPBB->empty() && match(&LatchVPBB->back(), m_BranchOnCond())) { LatchDL = LatchVPBB->getTerminator()->getDebugLoc(); LatchVPBB->getTerminator()->eraseFromParent(); } @@ -480,8 +479,7 @@ static void createExtractsForLiveOuts(VPlan &Plan, VPBasicBlock *MiddleVPBB) { static void addInitialSkeleton(VPlan &Plan, Type *InductionTy, DebugLoc IVDL, PredicatedScalarEvolution &PSE, Loop *TheLoop) { - VPDominatorTree VPDT; - VPDT.recalculate(Plan); + VPDominatorTree VPDT(Plan); auto *HeaderVPBB = cast<VPBasicBlock>(Plan.getEntry()->getSingleSuccessor()); canonicalHeaderAndLatch(HeaderVPBB, VPDT); @@ -623,8 +621,7 @@ void VPlanTransforms::addMiddleCheck(VPlan &Plan, } void VPlanTransforms::createLoopRegions(VPlan &Plan) { - VPDominatorTree VPDT; - VPDT.recalculate(Plan); + VPDominatorTree VPDT(Plan); for (VPBlockBase *HeaderVPB : vp_post_order_shallow(Plan.getEntry())) if (canonicalHeaderAndLatch(HeaderVPB, VPDT)) createLoopRegion(Plan, HeaderVPB); @@ -875,8 +872,7 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { Plan.getVectorLoopRegion()->getEntryBasicBlock())) { auto *VPBB = cast<VPBasicBlock>(VPB); for (auto &R : *VPBB) { - if (R.mayWriteToMemory() && - !match(&R, m_BranchOnCount(m_VPValue(), m_VPValue()))) + if (R.mayWriteToMemory() && !match(&R, m_BranchOnCount())) return false; } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h index 577432f..44506f5a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h +++ b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h @@ -39,7 +39,6 @@ class VPDominatorTree : public DominatorTreeBase<VPBlockBase, false> { using Base = DominatorTreeBase<VPBlockBase, false>; public: - VPDominatorTree() = default; explicit VPDominatorTree(VPlan &Plan) { recalculate(Plan); } /// Returns true if \p A properly dominates \p B. 
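Deleting the default constructor completes a migration visible throughout the patch: each two-step VPDominatorTree-then-recalculate sequence becomes a single construction, so the tree can no longer be observed in an uncomputed state. Usage sketch (LLVM-internal header):

    #include "VPlanDominatorTree.h"
    using namespace llvm;

    bool dominatesIn(VPlan &Plan, VPBlockBase *A, VPBlockBase *B) {
      VPDominatorTree VPDT(Plan); // recalculates eagerly at construction
      return VPDT.dominates(A, B);
    }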
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 555efea..b42b049 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -344,6 +344,10 @@ m_Freeze(const Op0_t &Op0) { return m_VPInstruction<Instruction::Freeze>(Op0); } +inline VPInstruction_match<VPInstruction::BranchOnCond> m_BranchOnCond() { + return m_VPInstruction<VPInstruction::BranchOnCond>(); +} + template <typename Op0_t> inline VPInstruction_match<VPInstruction::BranchOnCond, Op0_t> m_BranchOnCond(const Op0_t &Op0) { @@ -374,6 +378,10 @@ m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2); } +inline VPInstruction_match<VPInstruction::BranchOnCount> m_BranchOnCount() { + return m_VPInstruction<VPInstruction::BranchOnCount>(); +} + template <typename Op0_t, typename Op1_t> inline VPInstruction_match<VPInstruction::BranchOnCount, Op0_t, Op1_t> m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9bb8820..40b7e8d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1658,7 +1658,7 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, auto *Term = &ExitingVPBB->back(); VPValue *Cond; ScalarEvolution &SE = *PSE.getSE(); - if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) || + if (match(Term, m_BranchOnCount()) || match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask( m_VPValue(), m_VPValue(), m_VPValue()))))) { // Try to simplify the branch condition if TC <= VF * UF when the latch @@ -1909,8 +1909,7 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &LoopBuilder) { - VPDominatorTree VPDT; - VPDT.recalculate(Plan); + VPDominatorTree VPDT(Plan); SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis; for (VPRecipeBase &R : @@ -1992,6 +1991,13 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> { .Case<VPWidenIntrinsicRecipe>([](auto *I) { return std::make_pair(true, I->getVectorIntrinsicID()); }) + .Case<VPVectorPointerRecipe>([](auto *I) { + // For recipes that do not directly map to LLVM IR instructions, + // assign opcodes after the last VPInstruction opcode (which is also + // after the last IR Instruction opcode), based on the VPDefID. + return std::make_pair(false, + VPInstruction::OpsEnd + 1 + I->getVPDefID()); + }) .Default([](auto *) { return std::nullopt; }); } @@ -2015,11 +2021,8 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> { static bool canHandle(const VPSingleDefRecipe *Def) { // We can extend the list of handled recipes in the future, // provided we account for the data embedded in them while checking for - // equality or hashing. We assign VPVectorEndPointerRecipe the GEP opcode, - // as it is essentially a GEP with different semantics. - auto C = isa<VPVectorPointerRecipe>(Def) - ? std::make_pair(false, Instruction::GetElementPtr) - : getOpcodeOrIntrinsicID(Def); + // equality or hashing. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9bb8820..40b7e8d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1658,7 +1658,7 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
   auto *Term = &ExitingVPBB->back();
   VPValue *Cond;
   ScalarEvolution &SE = *PSE.getSE();
-  if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
+  if (match(Term, m_BranchOnCount()) ||
       match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
                       m_VPValue(), m_VPValue(), m_VPValue()))))) {
     // Try to simplify the branch condition if TC <= VF * UF when the latch
@@ -1909,8 +1909,7 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
 
 bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
                                                   VPBuilder &LoopBuilder) {
-  VPDominatorTree VPDT;
-  VPDT.recalculate(Plan);
+  VPDominatorTree VPDT(Plan);
 
   SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
   for (VPRecipeBase &R :
@@ -1992,6 +1991,13 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
         .Case<VPWidenIntrinsicRecipe>([](auto *I) {
           return std::make_pair(true, I->getVectorIntrinsicID());
         })
+        .Case<VPVectorPointerRecipe>([](auto *I) {
+          // For recipes that do not directly map to LLVM IR instructions,
+          // assign opcodes after the last VPInstruction opcode (which is also
+          // after the last IR Instruction opcode), based on the VPDefID.
+          return std::make_pair(false,
+                                VPInstruction::OpsEnd + 1 + I->getVPDefID());
+        })
         .Default([](auto *) { return std::nullopt; });
   }
 
@@ -2015,11 +2021,8 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
   static bool canHandle(const VPSingleDefRecipe *Def) {
     // We can extend the list of handled recipes in the future,
     // provided we account for the data embedded in them while checking for
-    // equality or hashing. We assign VPVectorEndPointerRecipe the GEP opcode,
-    // as it is essentially a GEP with different semantics.
-    auto C = isa<VPVectorPointerRecipe>(Def)
-                 ? std::make_pair(false, Instruction::GetElementPtr)
-                 : getOpcodeOrIntrinsicID(Def);
+    // equality or hashing.
+    auto C = getOpcodeOrIntrinsicID(Def);
 
     // The issue with (Insert|Extract)Value is that the index of the
     // insert/extract is not a proper operand in LLVM IR, and hence also not in
@@ -2058,6 +2061,8 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
         vputils::isSingleScalar(L) != vputils::isSingleScalar(R) ||
         !equal(L->operands(), R->operands()))
       return false;
+    assert(getOpcodeOrIntrinsicID(L) && getOpcodeOrIntrinsicID(R) &&
+           "must have valid opcode info for both recipes");
     if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
       if (LFlags->hasPredicate() &&
           LFlags->getPredicate() !=
@@ -3021,8 +3026,7 @@ void VPlanTransforms::createInterleaveGroups(
   // Interleave memory: for each Interleave Group we marked earlier as relevant
   // for this VPlan, replace the Recipes widening its memory instructions with a
   // single VPInterleaveRecipe at its insertion point.
-  VPDominatorTree VPDT;
-  VPDT.recalculate(Plan);
+  VPDominatorTree VPDT(Plan);
   for (const auto *IG : InterleaveGroups) {
     auto *Start =
         cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
@@ -3398,9 +3402,8 @@ void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
 
   VPBuilder Builder(LatchVPBB->getTerminator());
   VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
-  assert(
-      match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) &&
-      "Terminator must be be BranchOnCond");
+  assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
+         "Terminator must be be BranchOnCond");
   VPValue *CondOfEarlyExitingVPBB =
       EarlyExitingVPBB->getTerminator()->getOperand(0);
   auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
@@ -3662,8 +3665,7 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
     return;
 
 #ifndef NDEBUG
-  VPDominatorTree VPDT;
-  VPDT.recalculate(Plan);
+  VPDominatorTree VPDT(Plan);
 #endif
 
   SmallVector<VPValue *> VPValues;
@@ -4009,8 +4011,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
   unsigned VFMinVal = VF.getKnownMinValue();
   SmallVector<VPInterleaveRecipe *> StoreGroups;
   for (auto &R : *VectorLoop->getEntryBasicBlock()) {
-    if (isa<VPCanonicalIVPHIRecipe>(&R) ||
-        match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+    if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount()))
       continue;
 
     if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) &&
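
The new VPVectorPointerRecipe case keys the CSE table with a synthetic opcode placed past VPInstruction::OpsEnd, so recipes without a genuine LLVM IR opcode can never collide with real opcodes. A standalone sketch of that keying idea; the constants standing in for OpsEnd and the VPDefID are made up for illustration.

    #include <cassert>
    #include <utility>

    // Assumed stand-in: OpsEnd marks the last genuine opcode value.
    constexpr unsigned OpsEnd = 1000;

    // Real instructions hash under their own opcode...
    std::pair<bool, unsigned> keyForOpcode(unsigned Opcode) {
      assert(Opcode <= OpsEnd && "real opcodes never exceed OpsEnd");
      return {false, Opcode};
    }

    // ...while opcode-less recipes get OpsEnd + 1 + ID (the role VPDefID
    // plays in the patch), a disjoint range, so equal keys imply recipes of
    // the same kind.
    std::pair<bool, unsigned> keyForRecipeKind(unsigned DefID) {
      return {false, OpsEnd + 1 + DefID};
    }

    int main() {
      assert(keyForOpcode(42) != keyForRecipeKind(0)); // ranges never overlap
      return 0;
    }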
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 5e7f19f..1c4adfc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -259,8 +259,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
 
 /// Handle non-header-phi recipes.
 void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
-  if (match(&R, m_BranchOnCond(m_VPValue())) ||
-      match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+  if (match(&R, m_CombineOr(m_BranchOnCond(), m_BranchOnCount())))
     return;
 
   if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 013ea2e..5262af6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -24,6 +24,7 @@
 #define DEBUG_TYPE "loop-vectorize"
 
 using namespace llvm;
+using namespace VPlanPatternMatch;
 
 namespace {
 class VPlanVerifier {
@@ -198,7 +199,6 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
         }
         // EVLIVIncrement is only used by EVLIV & BranchOnCount.
         // Having more than two users is unexpected.
-        using namespace llvm::VPlanPatternMatch;
         if (I->getOpcode() != VPInstruction::Broadcast &&
             I->getNumUsers() != 1 &&
             (I->getNumUsers() != 2 ||
@@ -479,8 +479,7 @@ bool VPlanVerifier::verify(const VPlan &Plan) {
       }
 
       auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exiting->end()));
-      if (!LastInst || (LastInst->getOpcode() != VPInstruction::BranchOnCount &&
-                        LastInst->getOpcode() != VPInstruction::BranchOnCond)) {
+      if (!match(LastInst, m_CombineOr(m_BranchOnCond(), m_BranchOnCount()))) {
         errs() << "VPlan vector loop exit must end with BranchOnCount or "
                   "BranchOnCond VPInstruction\n";
         return false;
@@ -490,8 +489,7 @@ bool VPlanVerifier::verify(const VPlan &Plan) {
 }
 
 bool llvm::verifyVPlanIsValid(const VPlan &Plan, bool VerifyLate) {
-  VPDominatorTree VPDT;
-  VPDT.recalculate(const_cast<VPlan &>(Plan));
+  VPDominatorTree VPDT(const_cast<VPlan &>(Plan));
   VPTypeAnalysis TypeInfo(Plan);
   VPlanVerifier Verifier(VPDT, TypeInfo, VerifyLate);
   return Verifier.verify(Plan);
diff --git a/llvm/lib/XRay/BlockIndexer.cpp b/llvm/lib/XRay/BlockIndexer.cpp
index f4ba0eb..d0c6853 100644
--- a/llvm/lib/XRay/BlockIndexer.cpp
+++ b/llvm/lib/XRay/BlockIndexer.cpp
@@ -12,8 +12,8 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/XRay/BlockIndexer.h"
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 Error BlockIndexer::visit(BufferExtents &) { return Error::success(); }
 
@@ -89,6 +89,3 @@ Error BlockIndexer::flush() {
   CurrentBlock.WallclockTime = nullptr;
   return Error::success();
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/BlockPrinter.cpp b/llvm/lib/XRay/BlockPrinter.cpp
index 63a60c3..d85be5b 100644
--- a/llvm/lib/XRay/BlockPrinter.cpp
+++ b/llvm/lib/XRay/BlockPrinter.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/XRay/BlockPrinter.h"
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 Error BlockPrinter::visit(BufferExtents &R) {
   OS << "\n[New Block]\n";
@@ -108,6 +108,3 @@ Error BlockPrinter::visit(EndBufferRecord &R) {
   auto E = RP.visit(R);
   return E;
 }
-
-} // namespace xray
-} // namespace llvm
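
The XRay files above drop their namespace llvm { namespace xray { ... } } wrappers in favor of using-directives. For member-function definitions this is behavior-preserving: a using-directive only affects name lookup, while the definition still attaches to the class it names. A standalone sketch of the pattern, with demo names:

    #include <cstdio>

    // Stands in for a class declared in a header inside a namespace.
    namespace demo {
    struct BlockIndexer {
      int visit(int Record);
    };
    } // namespace demo

    using namespace demo;

    // Looks unqualified, but BlockIndexer is found via the using-directive,
    // so this still defines demo::BlockIndexer::visit.
    int BlockIndexer::visit(int Record) { return Record + 1; }

    int main() {
      demo::BlockIndexer BI;
      std::printf("%d\n", BI.visit(41)); // prints 42
      return 0;
    }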
diff --git a/llvm/lib/XRay/BlockVerifier.cpp b/llvm/lib/XRay/BlockVerifier.cpp
index 99f255e..e39f6b6 100644
--- a/llvm/lib/XRay/BlockVerifier.cpp
+++ b/llvm/lib/XRay/BlockVerifier.cpp
@@ -10,19 +10,18 @@
 
 #include <bitset>
 
-namespace llvm {
-namespace xray {
-namespace {
+using namespace llvm;
+using namespace llvm::xray;
 
-constexpr unsigned long long mask(BlockVerifier::State S) {
+static constexpr unsigned long long mask(BlockVerifier::State S) {
   return 1uLL << static_cast<std::size_t>(S);
 }
 
-constexpr std::size_t number(BlockVerifier::State S) {
+static constexpr std::size_t number(BlockVerifier::State S) {
   return static_cast<std::size_t>(S);
 }
 
-StringRef recordToString(BlockVerifier::State R) {
+static StringRef recordToString(BlockVerifier::State R) {
   switch (R) {
   case BlockVerifier::State::BufferExtents:
     return "BufferExtents";
@@ -53,6 +52,8 @@ StringRef recordToString(BlockVerifier::State R) {
   llvm_unreachable("Unkown state!");
 }
 
+namespace {
+
 struct Transition {
   BlockVerifier::State From;
   std::bitset<number(BlockVerifier::State::StateMax)> ToStates;
@@ -133,7 +134,7 @@ Error BlockVerifier::transition(State To) {
 
   CurrentRecord = To;
   return Error::success();
-} // namespace xray
+}
 
 Error BlockVerifier::visit(BufferExtents &) {
   return transition(State::BufferExtents);
@@ -201,6 +202,3 @@ Error BlockVerifier::verify() {
 }
 
 void BlockVerifier::reset() { CurrentRecord = State::Unknown; }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRRecordProducer.cpp b/llvm/lib/XRay/FDRRecordProducer.cpp
index 479b710..0f4eed1 100644
--- a/llvm/lib/XRay/FDRRecordProducer.cpp
+++ b/llvm/lib/XRay/FDRRecordProducer.cpp
@@ -10,8 +10,8 @@
 
 #include <cstdint>
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 namespace {
 
@@ -31,8 +31,9 @@ enum MetadataRecordKinds : uint8_t {
   // This is an end marker, used to identify the upper bound for this enum.
   EnumEndMarker,
 };
+} // namespace
 
-Expected<std::unique_ptr<Record>>
+static Expected<std::unique_ptr<Record>>
 metadataRecordType(const XRayFileHeader &Header, uint8_t T) {
 
   if (T >= static_cast<uint8_t>(MetadataRecordKinds::EnumEndMarker))
@@ -72,12 +73,10 @@ metadataRecordType(const XRayFileHeader &Header, uint8_t T) {
   llvm_unreachable("Unhandled MetadataRecordKinds enum value");
 }
 
-constexpr bool isMetadataIntroducer(uint8_t FirstByte) {
+static constexpr bool isMetadataIntroducer(uint8_t FirstByte) {
   return FirstByte & 0x01u;
 }
 
-} // namespace
-
 Expected<std::unique_ptr<Record>>
 FileBasedRecordProducer::findNextBufferExtent() {
   // We seek one byte at a time until we find a suitable buffer extents metadata
@@ -193,6 +192,3 @@ Expected<std::unique_ptr<Record>> FileBasedRecordProducer::produce() {
   assert(R != nullptr);
   return std::move(R);
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRRecords.cpp b/llvm/lib/XRay/FDRRecords.cpp
index ff315d3..a18f733 100644
--- a/llvm/lib/XRay/FDRRecords.cpp
+++ b/llvm/lib/XRay/FDRRecords.cpp
@@ -12,8 +12,8 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/XRay/FDRRecords.h"
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 Error BufferExtents::apply(RecordVisitor &V) { return V.visit(*this); }
 Error WallclockRecord::apply(RecordVisitor &V) { return V.visit(*this); }
@@ -61,6 +61,3 @@ StringRef Record::kindToString(RecordKind K) {
   }
   return "Unknown";
 }
-
-} // namespace xray
-} // namespace llvm
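
FDRRecordProducer.cpp also narrows the anonymous namespace to the enum that needs it and marks the free helpers static instead. Both spellings give internal linkage; static on the function itself is the LLVM coding-standard preference for file-local functions. A standalone sketch of the equivalence, with demo helper names (the predicate body mirrors isMetadataIntroducer from the diff):

    // Internal linkage via an anonymous namespace...
    namespace {
    constexpr bool isMetadataOld(unsigned char FirstByte) {
      return FirstByte & 0x01u;
    }
    } // namespace

    // ...and the equivalent via `static`, as the patch now spells it.
    static constexpr bool isMetadataNew(unsigned char FirstByte) {
      return FirstByte & 0x01u;
    }

    int main() { return isMetadataOld(3) == isMetadataNew(3) ? 0 : 1; }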
diff --git a/llvm/lib/XRay/FDRTraceExpander.cpp b/llvm/lib/XRay/FDRTraceExpander.cpp
index b68e997..991e6e5 100644
--- a/llvm/lib/XRay/FDRTraceExpander.cpp
+++ b/llvm/lib/XRay/FDRTraceExpander.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/XRay/FDRTraceExpander.h"
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 void TraceExpander::resetCurrentRecord() {
   if (BuildingRecord)
@@ -126,6 +126,3 @@ Error TraceExpander::flush() {
   resetCurrentRecord();
   return Error::success();
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRTraceWriter.cpp b/llvm/lib/XRay/FDRTraceWriter.cpp
index fb59125..3e320a6 100644
--- a/llvm/lib/XRay/FDRTraceWriter.cpp
+++ b/llvm/lib/XRay/FDRTraceWriter.cpp
@@ -12,8 +12,8 @@
 #include "llvm/XRay/FDRTraceWriter.h"
 #include <tuple>
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 namespace {
 
@@ -37,9 +37,10 @@ template <size_t Index> struct IndexedWriter {
     return 0;
   }
 };
+} // namespace
 
 template <uint8_t Kind, class... Values>
-Error writeMetadata(support::endian::Writer &OS, Values &&... Ds) {
+static Error writeMetadata(support::endian::Writer &OS, Values &&...Ds) {
   // The first bit in the first byte of metadata records is always set to 1, so
   // we ensure this is the case when we write out the first byte of the record.
   uint8_t FirstByte = (static_cast<uint8_t>(Kind) << 1) | uint8_t{0x01u};
@@ -54,8 +55,6 @@ Error writeMetadata(support::endian::Writer &OS, Values &&... Ds) {
   return Error::success();
 }
 
-} // namespace
-
 FDRTraceWriter::FDRTraceWriter(raw_ostream &O, const XRayFileHeader &H)
     : OS(O, llvm::endianness::native) {
   // We need to re-construct a header, by writing the fields we care about for
@@ -146,6 +145,3 @@ Error FDRTraceWriter::visit(FunctionRecord &R) {
   OS.write(R.delta());
   return Error::success();
 }
-
-} // namespace xray
-} // namespace llvm
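
writeMetadata keeps its body but becomes a file-local static function template outside the anonymous namespace. A standalone sketch of its shape: the record kind is a template parameter folded into a tagged first byte whose low bit marks a metadata record (as the diff's comment describes), followed by the forwarded field values; the byte-vector writer is a demo stand-in for support::endian::Writer.

    #include <cstdint>
    #include <vector>

    template <uint8_t Kind, class... Values>
    static void writeMetadataSketch(std::vector<uint8_t> &Out, Values &&...Ds) {
      // Low bit set: identifies the byte as a metadata-record introducer.
      Out.push_back(static_cast<uint8_t>((Kind << 1) | 0x01u));
      (Out.push_back(static_cast<uint8_t>(Ds)), ...); // fold over the pack
    }

    int main() {
      std::vector<uint8_t> Buf;
      writeMetadataSketch<4>(Buf, 1, 2, 3); // kind 4, three one-byte fields
      return Buf.size() == 4 ? 0 : 1;
    }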
diff --git a/llvm/lib/XRay/FileHeaderReader.cpp b/llvm/lib/XRay/FileHeaderReader.cpp
index 6b6daf9..681cef7 100644
--- a/llvm/lib/XRay/FileHeaderReader.cpp
+++ b/llvm/lib/XRay/FileHeaderReader.cpp
@@ -7,12 +7,13 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/XRay/FileHeaderReader.h"
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 // Populates the FileHeader reference by reading the first 32 bytes of the file.
-Expected<XRayFileHeader> readBinaryFormatHeader(DataExtractor &HeaderExtractor,
-                                                uint64_t &OffsetPtr) {
+Expected<XRayFileHeader>
+xray::readBinaryFormatHeader(DataExtractor &HeaderExtractor,
+                             uint64_t &OffsetPtr) {
   // FIXME: Maybe deduce whether the data is little or big-endian using some
   // magic bytes in the beginning of the file?
 
@@ -68,6 +69,3 @@ Expected<XRayFileHeader> readBinaryFormatHeader(DataExtractor &HeaderExtractor,
   OffsetPtr += 16;
   return std::move(FileHeader);
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/LogBuilderConsumer.cpp b/llvm/lib/XRay/LogBuilderConsumer.cpp
index ffb49f9..f0fc336 100644
--- a/llvm/lib/XRay/LogBuilderConsumer.cpp
+++ b/llvm/lib/XRay/LogBuilderConsumer.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/XRay/FDRRecordConsumer.h"
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 Error LogBuilderConsumer::consume(std::unique_ptr<Record> R) {
   if (!R)
@@ -32,6 +32,3 @@ Error PipelineConsumer::consume(std::unique_ptr<Record> R) {
     Result = joinErrors(std::move(Result), R->apply(*V));
   return Result;
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/Profile.cpp b/llvm/lib/XRay/Profile.cpp
index 1b340e5..ecb767b 100644
--- a/llvm/lib/XRay/Profile.cpp
+++ b/llvm/lib/XRay/Profile.cpp
@@ -18,8 +18,8 @@
 #include "llvm/XRay/Trace.h"
 #include <memory>
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 Profile::Profile(const Profile &O) {
   // We need to re-create all the tries from the original (O), into the current
@@ -46,6 +46,7 @@ struct BlockHeader {
   uint32_t Number;
   uint64_t Thread;
 };
+} // namespace
 
 static Expected<BlockHeader> readBlockHeader(DataExtractor &Extractor,
                                              uint64_t &Offset) {
@@ -115,8 +116,6 @@ static Expected<Profile::Data> readData(DataExtractor &Extractor,
   return D;
 }
 
-} // namespace
-
 Error Profile::addBlock(Block &&B) {
   if (B.PathData.empty())
     return make_error<StringError>(
@@ -189,7 +188,7 @@ Profile::PathID Profile::internPath(ArrayRef<FuncID> P) {
   return Node->ID;
 }
 
-Profile mergeProfilesByThread(const Profile &L, const Profile &R) {
+Profile xray::mergeProfilesByThread(const Profile &L, const Profile &R) {
   Profile Merged;
   using PathDataMap = DenseMap<Profile::PathID, Profile::Data>;
   using PathDataMapPtr = std::unique_ptr<PathDataMap>;
@@ -228,7 +227,7 @@ Profile mergeProfilesByThread(const Profile &L, const Profile &R) {
   return Merged;
 }
 
-Profile mergeProfilesByStack(const Profile &L, const Profile &R) {
+Profile xray::mergeProfilesByStack(const Profile &L, const Profile &R) {
   Profile Merged;
   using PathDataMap = DenseMap<Profile::PathID, Profile::Data>;
   PathDataMap PathData;
@@ -258,7 +257,7 @@ Profile mergeProfilesByStack(const Profile &L, const Profile &R) {
   return Merged;
 }
 
-Expected<Profile> loadProfile(StringRef Filename) {
+Expected<Profile> xray::loadProfile(StringRef Filename) {
   Expected<sys::fs::file_t> FdOrErr = sys::fs::openNativeFileForRead(Filename);
   if (!FdOrErr)
     return FdOrErr.takeError();
@@ -322,7 +321,7 @@ struct StackEntry {
 
 } // namespace
 
-Expected<Profile> profileFromTrace(const Trace &T) {
+Expected<Profile> xray::profileFromTrace(const Trace &T) {
   Profile P;
 
   // The implementation of the algorithm re-creates the execution of
@@ -397,6 +396,3 @@ Expected<Profile> profileFromTrace(const Trace &T) {
 
   return P;
 }
-
-} // namespace xray
-} // namespace llvm
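
Note the free-function definitions above become xray::mergeProfilesByThread, xray::loadProfile, and so on. Once the enclosing namespace block is gone, the qualifier is what ties each definition to its declaration in the header; for free functions, unlike the member definitions earlier, a using-directive does not help, and an unqualified definition would silently create a new global function. A standalone sketch with demo names:

    namespace demo {
    int mergeProfiles(int L, int R); // stands in for a header declaration
    } // namespace demo

    using namespace demo;

    // The qualifier attaches this definition to the declaration above. An
    // unqualified `int mergeProfiles(int, int)` here would instead declare
    // a brand-new function in the global namespace.
    int demo::mergeProfiles(int L, int R) { return L + R; }

    int main() { return demo::mergeProfiles(20, 22) == 42 ? 0 : 1; }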
diff --git a/llvm/lib/XRay/RecordInitializer.cpp b/llvm/lib/XRay/RecordInitializer.cpp
index 68ab3db..83d5f14 100644
--- a/llvm/lib/XRay/RecordInitializer.cpp
+++ b/llvm/lib/XRay/RecordInitializer.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/XRay/FDRRecords.h"
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 Error RecordInitializer::visit(BufferExtents &R) {
   if (!E.isValidOffsetForDataOfSize(OffsetPtr, sizeof(uint64_t)))
@@ -426,6 +426,3 @@ Error RecordInitializer::visit(FunctionRecord &R) {
   assert(FunctionRecord::kFunctionRecordSize == (OffsetPtr - BeginOffset));
   return Error::success();
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/RecordPrinter.cpp b/llvm/lib/XRay/RecordPrinter.cpp
index 32d4210..b9b7a16 100644
--- a/llvm/lib/XRay/RecordPrinter.cpp
+++ b/llvm/lib/XRay/RecordPrinter.cpp
@@ -9,8 +9,8 @@
 
 #include "llvm/Support/FormatVariadic.h"
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 Error RecordPrinter::visit(BufferExtents &R) {
   OS << formatv("<Buffer: size = {0} bytes>", R.size()) << Delim;
@@ -103,6 +103,3 @@ Error RecordPrinter::visit(FunctionRecord &R) {
   OS << Delim;
   return Error::success();
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/Trace.cpp b/llvm/lib/XRay/Trace.cpp
index 74515b1..14a3f01 100644
--- a/llvm/lib/XRay/Trace.cpp
+++ b/llvm/lib/XRay/Trace.cpp
@@ -29,11 +29,9 @@ using namespace llvm;
 using namespace llvm::xray;
 using llvm::yaml::Input;
 
-namespace {
-
-Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
-                         XRayFileHeader &FileHeader,
-                         std::vector<XRayRecord> &Records) {
+static Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
+                                XRayFileHeader &FileHeader,
+                                std::vector<XRayRecord> &Records) {
   if (Data.size() < 32)
     return make_error<StringError>(
         "Not enough bytes for an XRay log.",
@@ -265,8 +263,9 @@ Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
 /// what FunctionRecord instances use, and we no longer need to include the CPU
 /// id in the CustomEventRecord.
 ///
-Error loadFDRLog(StringRef Data, bool IsLittleEndian,
-                 XRayFileHeader &FileHeader, std::vector<XRayRecord> &Records) {
+static Error loadFDRLog(StringRef Data, bool IsLittleEndian,
+                        XRayFileHeader &FileHeader,
+                        std::vector<XRayRecord> &Records) {
 
   if (Data.size() < 32)
     return createStringError(std::make_error_code(std::errc::invalid_argument),
@@ -348,8 +347,8 @@ Error loadFDRLog(StringRef Data, bool IsLittleEndian,
   return Error::success();
 }
 
-Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
-                  std::vector<XRayRecord> &Records) {
+static Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
+                         std::vector<XRayRecord> &Records) {
   YAMLXRayTrace Trace;
   Input In(Data);
   In >> Trace;
@@ -376,7 +375,6 @@ Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
   });
   return Error::success();
 }
-} // namespace
 
 Expected<Trace> llvm::xray::loadTraceFile(StringRef Filename, bool Sort) {
   Expected<sys::fs::file_t> FdOrErr = sys::fs::openNativeFileForRead(Filename);