75 files changed, 1586 insertions, 783 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 4969528..dd98b62 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1659,7 +1659,6 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
   case Intrinsic::aarch64_sve_convert_from_svbool:
   case Intrinsic::wasm_alltrue:
   case Intrinsic::wasm_anytrue:
-  case Intrinsic::wasm_dot:
   // WebAssembly float semantics are always known
   case Intrinsic::wasm_trunc_signed:
   case Intrinsic::wasm_trunc_unsigned:
@@ -3990,30 +3989,6 @@ static Constant *ConstantFoldFixedVectorCall(
     }
     return ConstantVector::get(Result);
   }
-  case Intrinsic::wasm_dot: {
-    unsigned NumElements =
-        cast<FixedVectorType>(Operands[0]->getType())->getNumElements();
-
-    assert(NumElements == 8 && Result.size() == 4 &&
-           "wasm dot takes i16x8 and produces i32x4");
-    assert(Ty->isIntegerTy());
-    int32_t MulVector[8];
-
-    for (unsigned I = 0; I < NumElements; ++I) {
-      ConstantInt *Elt0 =
-          cast<ConstantInt>(Operands[0]->getAggregateElement(I));
-      ConstantInt *Elt1 =
-          cast<ConstantInt>(Operands[1]->getAggregateElement(I));
-
-      MulVector[I] = Elt0->getSExtValue() * Elt1->getSExtValue();
-    }
-    for (unsigned I = 0; I < Result.size(); I++) {
-      int32_t IAdd = MulVector[I * 2] + MulVector[I * 2 + 1];
-      Result[I] = ConstantInt::get(Ty, IAdd);
-    }
-
-    return ConstantVector::get(Result);
-  }
   default:
     break;
   }
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index f1473b2..256befa 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -180,8 +180,8 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA,
   for (inst_iterator SrcI = inst_begin(F), SrcE = inst_end(F); SrcI != SrcE;
        ++SrcI) {
     if (SrcI->mayReadOrWriteMemory()) {
-      for (inst_iterator DstI = SrcI, DstE = inst_end(F);
-           DstI != DstE; ++DstI) {
+      for (inst_iterator DstI = SrcI, DstE = inst_end(F); DstI != DstE;
+           ++DstI) {
         if (DstI->mayReadOrWriteMemory()) {
           OS << "Src:" << *SrcI << " --> Dst:" << *DstI << "\n";
           OS << "  da analyze - ";
@@ -203,7 +203,7 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA,
 
             // Normalize negative direction vectors if required by clients.
             if (NormalizeResults && D->normalize(&SE))
-                OS << "normalized - ";
+              OS << "normalized - ";
             D->dump(OS);
             for (unsigned Level = 1; Level <= D->getLevels(); Level++) {
               if (D->isSplitable(Level)) {
@@ -227,8 +227,8 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA,
 
 void DependenceAnalysisWrapperPass::print(raw_ostream &OS,
                                           const Module *) const {
-  dumpExampleDependence(OS, info.get(),
-                        getAnalysis<ScalarEvolutionWrapperPass>().getSE(), false);
+  dumpExampleDependence(
+      OS, info.get(), getAnalysis<ScalarEvolutionWrapperPass>().getSE(), false);
 }
 
 PreservedAnalyses
@@ -249,33 +249,26 @@ bool Dependence::isInput() const {
   return Src->mayReadFromMemory() && Dst->mayReadFromMemory();
 }
 
-
 // Returns true if this is an output dependence.
 bool Dependence::isOutput() const {
   return Src->mayWriteToMemory() && Dst->mayWriteToMemory();
 }
 
-
 // Returns true if this is an flow (aka true)  dependence.
 bool Dependence::isFlow() const {
   return Src->mayWriteToMemory() && Dst->mayReadFromMemory();
 }
 
-
 // Returns true if this is an anti dependence.
 bool Dependence::isAnti() const {
   return Src->mayReadFromMemory() && Dst->mayWriteToMemory();
 }
 
-
 // Returns true if a particular level is scalar; that is,
 // if no subscript in the source or destination mention the induction
 // variable associated with the loop at this level.
 // Leave this out of line, so it will serve as a virtual method anchor
-bool Dependence::isScalar(unsigned level) const {
-  return false;
-}
-
+bool Dependence::isScalar(unsigned level) const { return false; }
 
 //===----------------------------------------------------------------------===//
 // FullDependence methods
@@ -338,8 +331,7 @@ bool FullDependence::normalize(ScalarEvolution *SE) {
     DV[Level - 1].Direction = RevDirection;
     // Reverse the dependence distance as well.
     if (DV[Level - 1].Distance != nullptr)
-      DV[Level - 1].Distance =
-          SE->getNegativeSCEV(DV[Level - 1].Distance);
+      DV[Level - 1].Distance = SE->getNegativeSCEV(DV[Level - 1].Distance);
   }
 
   LLVM_DEBUG(dbgs() << "After normalizing negative direction vectors:\n";
@@ -355,14 +347,12 @@ unsigned FullDependence::getDirection(unsigned Level) const {
   return DV[Level - 1].Direction;
 }
 
-
 // Returns the distance (or NULL) associated with a particular level.
 const SCEV *FullDependence::getDistance(unsigned Level) const {
   assert(0 < Level && Level <= Levels && "Level out of range");
   return DV[Level - 1].Distance;
 }
 
-
 // Returns true if a particular level is scalar; that is,
 // if no subscript in the source or destination mention the induction
 // variable associated with the loop at this level.
@@ -371,7 +361,6 @@ bool FullDependence::isScalar(unsigned Level) const {
   return DV[Level - 1].Scalar;
 }
 
-
 // Returns true if peeling the first iteration from this loop
 // will break this dependence.
 bool FullDependence::isPeelFirst(unsigned Level) const {
@@ -379,7 +368,6 @@ bool FullDependence::isPeelFirst(unsigned Level) const {
   return DV[Level - 1].PeelFirst;
 }
 
-
 // Returns true if peeling the last iteration from this loop
 // will break this dependence.
 bool FullDependence::isPeelLast(unsigned Level) const {
@@ -387,14 +375,12 @@ bool FullDependence::isPeelLast(unsigned Level) const {
   return DV[Level - 1].PeelLast;
 }
 
-
 // Returns true if splitting this loop will break the dependence.
 bool FullDependence::isSplitable(unsigned Level) const {
   assert(0 < Level && Level <= Levels && "Level out of range");
   return DV[Level - 1].Splitable;
 }
 
-
 //===----------------------------------------------------------------------===//
 // DependenceInfo::Constraint methods
 
@@ -405,7 +391,6 @@ const SCEV *DependenceInfo::Constraint::getX() const {
   return A;
 }
 
-
 // If constraint is a point <X, Y>, returns Y.
 // Otherwise assert.
 const SCEV *DependenceInfo::Constraint::getY() const {
@@ -413,7 +398,6 @@ const SCEV *DependenceInfo::Constraint::getY() const {
   return B;
 }
 
-
 // If constraint is a line AX + BY = C, returns A.
 // Otherwise assert.
 const SCEV *DependenceInfo::Constraint::getA() const {
@@ -422,7 +406,6 @@ const SCEV *DependenceInfo::Constraint::getA() const {
   return A;
 }
 
-
 // If constraint is a line AX + BY = C, returns B.
 // Otherwise assert.
 const SCEV *DependenceInfo::Constraint::getB() const {
@@ -431,7 +414,6 @@ const SCEV *DependenceInfo::Constraint::getB() const {
   return B;
 }
 
-
 // If constraint is a line AX + BY = C, returns C.
 // Otherwise assert.
 const SCEV *DependenceInfo::Constraint::getC() const {
@@ -440,7 +422,6 @@ const SCEV *DependenceInfo::Constraint::getC() const {
   return C;
 }
 
-
 // If constraint is a distance, returns D.
 // Otherwise assert.
 const SCEV *DependenceInfo::Constraint::getD() const {
@@ -448,7 +429,6 @@ const SCEV *DependenceInfo::Constraint::getD() const {
   return SE->getNegativeSCEV(C);
 }
 
-
 // Returns the loop associated with this constraint.
 const Loop *DependenceInfo::Constraint::getAssociatedLoop() const {
   assert((Kind == Distance || Kind == Line || Kind == Point) &&
@@ -499,17 +479,16 @@ LLVM_DUMP_METHOD void DependenceInfo::Constraint::dump(raw_ostream &OS) const {
   else if (isPoint())
     OS << " Point is <" << *getX() << ", " << *getY() << ">\n";
   else if (isDistance())
-    OS << " Distance is " << *getD() <<
-      " (" << *getA() << "*X + " << *getB() << "*Y = " << *getC() << ")\n";
+    OS << " Distance is " << *getD() << " (" << *getA() << "*X + " << *getB()
+       << "*Y = " << *getC() << ")\n";
   else if (isLine())
-    OS << " Line is " << *getA() << "*X + " <<
-      *getB() << "*Y = " << *getC() << "\n";
+    OS << " Line is " << *getA() << "*X + " << *getB() << "*Y = " << *getC()
+       << "\n";
   else
     llvm_unreachable("unknown constraint type in Constraint::dump");
 }
 #endif
 
-
 // Updates X with the intersection
 // of the Constraints X and Y. Returns true if X has changed.
 // Corresponds to Figure 4 from the paper
@@ -591,15 +570,14 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
       const SCEV *A1B2 = SE->getMulExpr(X->getA(), Y->getB());
       const SCEV *A2B1 = SE->getMulExpr(Y->getA(), X->getB());
       const SCEVConstant *C1A2_C2A1 =
-        dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1A2, C2A1));
+          dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1A2, C2A1));
       const SCEVConstant *C1B2_C2B1 =
-        dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1B2, C2B1));
+          dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1B2, C2B1));
       const SCEVConstant *A1B2_A2B1 =
-        dyn_cast<SCEVConstant>(SE->getMinusSCEV(A1B2, A2B1));
+          dyn_cast<SCEVConstant>(SE->getMinusSCEV(A1B2, A2B1));
       const SCEVConstant *A2B1_A1B2 =
-        dyn_cast<SCEVConstant>(SE->getMinusSCEV(A2B1, A1B2));
-      if (!C1B2_C2B1 || !C1A2_C2A1 ||
-          !A1B2_A2B1 || !A2B1_A1B2)
+          dyn_cast<SCEVConstant>(SE->getMinusSCEV(A2B1, A1B2));
+      if (!C1B2_C2B1 || !C1A2_C2A1 || !A1B2_A2B1 || !A2B1_A1B2)
         return false;
       APInt Xtop = C1B2_C2B1->getAPInt();
       APInt Xbot = A1B2_A2B1->getAPInt();
@@ -626,8 +604,8 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
         ++DeltaSuccesses;
         return true;
       }
-      if (const SCEVConstant *CUB =
-          collectConstantUpperBound(X->getAssociatedLoop(), Prod1->getType())) {
+      if (const SCEVConstant *CUB = collectConstantUpperBound(
+              X->getAssociatedLoop(), Prod1->getType())) {
         const APInt &UpperBound = CUB->getAPInt();
         LLVM_DEBUG(dbgs() << "\t\tupper bound = " << UpperBound << "\n");
         if (Xq.sgt(UpperBound) || Yq.sgt(UpperBound)) {
@@ -636,8 +614,7 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
           return true;
         }
       }
-      X->setPoint(SE->getConstant(Xq),
-                  SE->getConstant(Yq),
+      X->setPoint(SE->getConstant(Xq), SE->getConstant(Yq),
                   X->getAssociatedLoop());
       ++DeltaSuccesses;
       return true;
@@ -667,7 +644,6 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
   return false;
 }
 
-
 //===----------------------------------------------------------------------===//
 // DependenceInfo methods
 
@@ -737,8 +713,7 @@ void Dependence::dump(raw_ostream &OS) const {
 // tbaa, non-overlapping regions etc), then it is known there is no dependecy.
 // Otherwise the underlying objects are checked to see if they point to
 // different identifiable objects.
-static AliasResult underlyingObjectsAlias(AAResults *AA,
-                                          const DataLayout &DL,
+static AliasResult underlyingObjectsAlias(AAResults *AA, const DataLayout &DL,
                                           const MemoryLocation &LocA,
                                           const MemoryLocation &LocB) {
   // Check the original locations (minus size) for noalias, which can happen for
@@ -773,8 +748,7 @@ static AliasResult underlyingObjectsAlias(AAResults *AA,
 
 // Returns true if the load or store can be analyzed. Atomic and volatile
 // operations have properties which this analysis does not understand.
-static
-bool isLoadOrStore(const Instruction *I) {
+static bool isLoadOrStore(const Instruction *I) {
   if (const LoadInst *LI = dyn_cast<LoadInst>(I))
     return LI->isUnordered();
   else if (const StoreInst *SI = dyn_cast<StoreInst>(I))
@@ -782,7 +756,6 @@ bool isLoadOrStore(const Instruction *I) {
   return false;
 }
 
-
 // Examines the loop nesting of the Src and Dst
 // instructions and establishes their shared loops. Sets the variables
 // CommonLevels, SrcLevels, and MaxLevels.
@@ -860,14 +833,12 @@ void DependenceInfo::establishNestingLevels(const Instruction *Src,
   MaxLevels -= CommonLevels;
 }
 
-
 // Given one of the loops containing the source, return
 // its level index in our numbering scheme.
 unsigned DependenceInfo::mapSrcLoop(const Loop *SrcLoop) const {
   return SrcLoop->getLoopDepth();
 }
 
-
 // Given one of the loops containing the destination,
 // return its level index in our numbering scheme.
 unsigned DependenceInfo::mapDstLoop(const Loop *DstLoop) const {
@@ -880,7 +851,6 @@ unsigned DependenceInfo::mapDstLoop(const Loop *DstLoop) const {
     return D;
 }
 
-
 // Returns true if Expression is loop invariant in LoopNest.
 bool DependenceInfo::isLoopInvariant(const SCEV *Expression,
                                      const Loop *LoopNest) const {
@@ -896,8 +866,6 @@ bool DependenceInfo::isLoopInvariant(const SCEV *Expression,
   return SE->isLoopInvariant(Expression, LoopNest->getOutermostLoop());
 }
 
-
-
 // Finds the set of loops from the LoopNest that
 // have a level <= CommonLevels and are referred to by the SCEV Expression.
 void DependenceInfo::collectCommonLoops(const SCEV *Expression,
@@ -924,9 +892,9 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) {
     IntegerType *SrcTy = dyn_cast<IntegerType>(Src->getType());
     IntegerType *DstTy = dyn_cast<IntegerType>(Dst->getType());
     if (SrcTy == nullptr || DstTy == nullptr) {
-      assert(SrcTy == DstTy && "This function only unify integer types and "
-             "expect Src and Dst share the same type "
-             "otherwise.");
+      assert(SrcTy == DstTy &&
+             "This function only unify integer types and "
+             "expect Src and Dst share the same type otherwise.");
       continue;
     }
     if (SrcTy->getBitWidth() > widestWidthSeen) {
@@ -939,7 +907,6 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) {
     }
   }
 
-
   assert(widestWidthSeen > 0);
 
   // Now extend each pair to the widest seen.
@@ -949,9 +916,9 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) {
     IntegerType *SrcTy = dyn_cast<IntegerType>(Src->getType());
     IntegerType *DstTy = dyn_cast<IntegerType>(Dst->getType());
     if (SrcTy == nullptr || DstTy == nullptr) {
-      assert(SrcTy == DstTy && "This function only unify integer types and "
-             "expect Src and Dst share the same type "
-             "otherwise.");
+      assert(SrcTy == DstTy &&
+             "This function only unify integer types and "
+             "expect Src and Dst share the same type otherwise.");
       continue;
     }
     if (SrcTy->getBitWidth() < widestWidthSeen)
@@ -1028,7 +995,6 @@ bool DependenceInfo::checkDstSubscript(const SCEV *Dst, const Loop *LoopNest,
   return checkSubscript(Dst, LoopNest, Loops, false);
 }
 
-
 // Examines the subscript pair (the Src and Dst SCEVs)
 // and classifies it as either ZIV, SIV, RDIV, MIV, or Nonlinear.
 // Collects the associated loops in a set.
@@ -1049,14 +1015,12 @@ DependenceInfo::classifyPair(const SCEV *Src, const Loop *SrcLoopNest,
     return Subscript::ZIV;
   if (N == 1)
     return Subscript::SIV;
-  if (N == 2 && (SrcLoops.count() == 0 ||
-                 DstLoops.count() == 0 ||
+  if (N == 2 && (SrcLoops.count() == 0 || DstLoops.count() == 0 ||
                  (SrcLoops.count() == 1 && DstLoops.count() == 1)))
     return Subscript::RDIV;
   return Subscript::MIV;
 }
 
-
 // A wrapper around SCEV::isKnownPredicate.
 // Looks for cases where we're interested in comparing for equality.
 // If both X and Y have been identically sign or zero extended,
@@ -1069,12 +1033,9 @@ DependenceInfo::classifyPair(const SCEV *Src, const Loop *SrcLoopNest,
 // involving symbolics.
 bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X,
                                       const SCEV *Y) const {
-  if (Pred == CmpInst::ICMP_EQ ||
-      Pred == CmpInst::ICMP_NE) {
-    if ((isa<SCEVSignExtendExpr>(X) &&
-         isa<SCEVSignExtendExpr>(Y)) ||
-        (isa<SCEVZeroExtendExpr>(X) &&
-         isa<SCEVZeroExtendExpr>(Y))) {
+  if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) {
+    if ((isa<SCEVSignExtendExpr>(X) && isa<SCEVSignExtendExpr>(Y)) ||
+        (isa<SCEVZeroExtendExpr>(X) && isa<SCEVZeroExtendExpr>(Y))) {
       const SCEVIntegralCastExpr *CX = cast<SCEVIntegralCastExpr>(X);
       const SCEVIntegralCastExpr *CY = cast<SCEVIntegralCastExpr>(Y);
       const SCEV *Xop = CX->getOperand();
@@ -1111,7 +1072,10 @@ bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X,
   }
 }
 
-/// Compare to see if S is less than Size, using isKnownNegative(S - max(Size, 1))
+/// Compare to see if S is less than Size, using
+///
+///    isKnownNegative(S - max(Size, 1))
+///
 /// with some extra checking if S is an AddRec and we can prove less-than using
 /// the loop bounds.
 bool DependenceInfo::isKnownLessThan(const SCEV *S, const SCEV *Size) const {
@@ -1178,7 +1142,6 @@ const SCEV *DependenceInfo::collectUpperBound(const Loop *L, Type *T) const {
   return nullptr;
 }
 
-
 // Calls collectUpperBound(), then attempts to cast it to SCEVConstant.
 // If the cast fails, returns NULL.
 const SCEVConstant *DependenceInfo::collectConstantUpperBound(const Loop *L,
@@ -1188,7 +1151,6 @@ const SCEVConstant *DependenceInfo::collectConstantUpperBound(const Loop *L,
   return nullptr;
 }
 
-
 // testZIV -
 // When we have a pair of subscripts of the form [c1] and [c2],
 // where c1 and c2 are both loop invariant, we attack it using
@@ -1218,7 +1180,6 @@ bool DependenceInfo::testZIV(const SCEV *Src, const SCEV *Dst,
   return false; // possibly dependent
 }
 
-
 // strongSIVtest -
 // From the paper, Practical Dependence Testing, Section 4.2.1
 //
@@ -1270,9 +1231,9 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
     LLVM_DEBUG(dbgs() << "\t    UpperBound = " << *UpperBound);
     LLVM_DEBUG(dbgs() << ", " << *UpperBound->getType() << "\n");
     const SCEV *AbsDelta =
-      SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta);
+        SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta);
     const SCEV *AbsCoeff =
-      SE->isKnownNonNegative(Coeff) ? Coeff : SE->getNegativeSCEV(Coeff);
+        SE->isKnownNonNegative(Coeff) ? Coeff : SE->getNegativeSCEV(Coeff);
     const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff);
     if (isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product)) {
       // Distance greater than trip count - no dependence
@@ -1286,7 +1247,7 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
   if (isa<SCEVConstant>(Delta) && isa<SCEVConstant>(Coeff)) {
     APInt ConstDelta = cast<SCEVConstant>(Delta)->getAPInt();
     APInt ConstCoeff = cast<SCEVConstant>(Coeff)->getAPInt();
-    APInt Distance  = ConstDelta; // these need to be initialized
+    APInt Distance = ConstDelta; // these need to be initialized
     APInt Remainder = ConstDelta;
     APInt::sdivrem(ConstDelta, ConstCoeff, Distance, Remainder);
     LLVM_DEBUG(dbgs() << "\t    Distance = " << Distance << "\n");
@@ -1307,29 +1268,25 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
     else
       Result.DV[Level].Direction &= Dependence::DVEntry::EQ;
     ++StrongSIVsuccesses;
-  }
-  else if (Delta->isZero()) {
+  } else if (Delta->isZero()) {
     // since 0/X == 0
     Result.DV[Level].Distance = Delta;
     NewConstraint.setDistance(Delta, CurLoop);
     Result.DV[Level].Direction &= Dependence::DVEntry::EQ;
     ++StrongSIVsuccesses;
-  }
-  else {
+  } else {
     if (Coeff->isOne()) {
       LLVM_DEBUG(dbgs() << "\t    Distance = " << *Delta << "\n");
       Result.DV[Level].Distance = Delta; // since X/1 == X
       NewConstraint.setDistance(Delta, CurLoop);
-    }
-    else {
+    } else {
       Result.Consistent = false;
-      NewConstraint.setLine(Coeff,
-                            SE->getNegativeSCEV(Coeff),
+      NewConstraint.setLine(Coeff, SE->getNegativeSCEV(Coeff),
                             SE->getNegativeSCEV(Delta), CurLoop);
     }
 
     // maybe we can get a useful direction
-    bool DeltaMaybeZero     = !SE->isKnownNonZero(Delta);
+    bool DeltaMaybeZero = !SE->isKnownNonZero(Delta);
     bool DeltaMaybePositive = !SE->isKnownNonPositive(Delta);
     bool DeltaMaybeNegative = !SE->isKnownNonNegative(Delta);
     bool CoeffMaybePositive = !SE->isKnownNonPositive(Coeff);
@@ -1353,7 +1310,6 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
   return false;
 }
 
-
 // weakCrossingSIVtest -
 // From the paper, Practical Dependence Testing, Section 4.2.2
 //
@@ -1447,8 +1403,8 @@ bool DependenceInfo::weakCrossingSIVtest(
   if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
     LLVM_DEBUG(dbgs() << "\t    UpperBound = " << *UpperBound << "\n");
     const SCEV *ConstantTwo = SE->getConstant(UpperBound->getType(), 2);
-    const SCEV *ML = SE->getMulExpr(SE->getMulExpr(ConstCoeff, UpperBound),
-                                    ConstantTwo);
+    const SCEV *ML =
+        SE->getMulExpr(SE->getMulExpr(ConstCoeff, UpperBound), ConstantTwo);
     LLVM_DEBUG(dbgs() << "\t    ML = " << *ML << "\n");
     if (isKnownPredicate(CmpInst::ICMP_SGT, Delta, ML)) {
       // Delta too big, no dependence
@@ -1498,7 +1454,6 @@ bool DependenceInfo::weakCrossingSIVtest(
   return false;
 }
 
-
 // Kirch's algorithm, from
 //
 //        Optimizing Supercompilers for Supercomputers
@@ -1519,9 +1474,11 @@ static bool findGCD(unsigned Bits, const APInt &AM, const APInt &BM,
   APInt R = G0;
   APInt::sdivrem(G0, G1, Q, R);
   while (R != 0) {
+    // clang-format off
     APInt A2 = A0 - Q*A1; A0 = A1; A1 = A2;
     APInt B2 = B0 - Q*B1; B0 = B1; B1 = B2;
     G0 = G1; G1 = R;
+    // clang-format on
     APInt::sdivrem(G0, G1, Q, R);
   }
   G = G1;
@@ -1543,8 +1500,7 @@ static APInt floorOfQuotient(const APInt &A, const APInt &B) {
   APInt::sdivrem(A, B, Q, R);
   if (R == 0)
     return Q;
-  if ((A.sgt(0) && B.sgt(0)) ||
-      (A.slt(0) && B.slt(0)))
+  if ((A.sgt(0) && B.sgt(0)) || (A.slt(0) && B.slt(0)))
     return Q;
   else
     return Q - 1;
@@ -1556,8 +1512,7 @@ static APInt ceilingOfQuotient(const APInt &A, const APInt &B) {
   APInt::sdivrem(A, B, Q, R);
   if (R == 0)
     return Q;
-  if ((A.sgt(0) && B.sgt(0)) ||
-      (A.slt(0) && B.slt(0)))
+  if ((A.sgt(0) && B.sgt(0)) || (A.slt(0) && B.slt(0)))
     return Q + 1;
   else
     return Q;
@@ -1733,17 +1688,14 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
   return Result.DV[Level].Direction == Dependence::DVEntry::NONE;
 }
 
-
 // Return true if the divisor evenly divides the dividend.
-static
-bool isRemainderZero(const SCEVConstant *Dividend,
-                     const SCEVConstant *Divisor) {
+static bool isRemainderZero(const SCEVConstant *Dividend,
+                            const SCEVConstant *Divisor) {
   const APInt &ConstDividend = Dividend->getAPInt();
   const APInt &ConstDivisor = Divisor->getAPInt();
   return ConstDividend.srem(ConstDivisor) == 0;
 }
 
-
 // weakZeroSrcSIVtest -
 // From the paper, Practical Dependence Testing, Section 4.2.2
 //
@@ -1807,11 +1759,11 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff,
   const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(DstCoeff);
   if (!ConstCoeff)
     return false;
-  const SCEV *AbsCoeff =
-    SE->isKnownNegative(ConstCoeff) ?
-    SE->getNegativeSCEV(ConstCoeff) : ConstCoeff;
+  const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff)
+                             ? SE->getNegativeSCEV(ConstCoeff)
+                             : ConstCoeff;
   const SCEV *NewDelta =
-    SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta;
+      SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta;
 
   // check that Delta/SrcCoeff < iteration count
   // really check NewDelta < count*AbsCoeff
@@ -1853,7 +1805,6 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff,
   return false;
 }
 
-
 // weakZeroDstSIVtest -
 // From the paper, Practical Dependence Testing, Section 4.2.2
 //
@@ -1916,11 +1867,11 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff,
   const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(SrcCoeff);
   if (!ConstCoeff)
     return false;
-  const SCEV *AbsCoeff =
-    SE->isKnownNegative(ConstCoeff) ?
-    SE->getNegativeSCEV(ConstCoeff) : ConstCoeff;
+  const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff)
+                             ? SE->getNegativeSCEV(ConstCoeff)
+                             : ConstCoeff;
   const SCEV *NewDelta =
-    SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta;
+      SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta;
 
   // check that Delta/SrcCoeff < iteration count
   // really check NewDelta < count*AbsCoeff
@@ -1962,7 +1913,6 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff,
   return false;
 }
 
-
 // exactRDIVtest - Tests the RDIV subscript pair for dependence.
 // Things of the form [c1 + a*i] and [c2 + b*j],
 // where i and j are induction variable, c1 and c2 are loop invariant,
@@ -2084,7 +2034,6 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
   return TL.sgt(TU);
 }
 
-
 // symbolicRDIVtest -
 // In Section 4.5 of the Practical Dependence Testing paper,the authors
 // introduce a special case of Banerjee's Inequalities (also called the
@@ -2167,8 +2116,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
           return true;
         }
       }
-    }
-    else if (SE->isKnownNonPositive(A2)) {
+    } else if (SE->isKnownNonPositive(A2)) {
       // a1 >= 0 && a2 <= 0
       if (N1 && N2) {
         // make sure that c2 - c1 <= a1*N1 - a2*N2
@@ -2187,8 +2135,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
         return true;
       }
     }
-  }
-  else if (SE->isKnownNonPositive(A1)) {
+  } else if (SE->isKnownNonPositive(A1)) {
     if (SE->isKnownNonNegative(A2)) {
       // a1 <= 0 && a2 >= 0
       if (N1 && N2) {
@@ -2207,8 +2154,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
         ++SymbolicRDIVindependence;
         return true;
       }
-    }
-    else if (SE->isKnownNonPositive(A2)) {
+    } else if (SE->isKnownNonPositive(A2)) {
       // a1 <= 0 && a2 <= 0
       if (N1) {
         // make sure that a1*N1 <= c2 - c1
@@ -2233,7 +2179,6 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
   return false;
 }
 
-
 // testSIV -
 // When we have a pair of subscripts of the form [c1 + a1*i] and [c2 - a2*i]
 // where i is an induction variable, c1 and c2 are loop invariant, and a1 and
@@ -2260,17 +2205,17 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level,
     Level = mapSrcLoop(CurLoop);
     bool disproven;
     if (SrcCoeff == DstCoeff)
-      disproven = strongSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop,
-                                Level, Result, NewConstraint);
+      disproven = strongSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, Level,
+                                Result, NewConstraint);
     else if (SrcCoeff == SE->getNegativeSCEV(DstCoeff))
       disproven = weakCrossingSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop,
                                       Level, Result, NewConstraint, SplitIter);
     else
       disproven = exactSIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop,
                                Level, Result, NewConstraint);
-    return disproven ||
-      gcdMIVtest(Src, Dst, Result) ||
-      symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, CurLoop);
+    return disproven || gcdMIVtest(Src, Dst, Result) ||
+           symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop,
+                            CurLoop);
   }
   if (SrcAddRec) {
     const SCEV *SrcConst = SrcAddRec->getStart();
@@ -2278,9 +2223,9 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level,
     const SCEV *DstConst = Dst;
     const Loop *CurLoop = SrcAddRec->getLoop();
     Level = mapSrcLoop(CurLoop);
-    return weakZeroDstSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop,
-                              Level, Result, NewConstraint) ||
-      gcdMIVtest(Src, Dst, Result);
+    return weakZeroDstSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, Level,
+                              Result, NewConstraint) ||
+           gcdMIVtest(Src, Dst, Result);
   }
   if (DstAddRec) {
     const SCEV *DstConst = DstAddRec->getStart();
@@ -2288,15 +2233,14 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level,
     const SCEV *SrcConst = Src;
     const Loop *CurLoop = DstAddRec->getLoop();
     Level = mapDstLoop(CurLoop);
-    return weakZeroSrcSIVtest(DstCoeff, SrcConst, DstConst,
-                              CurLoop, Level, Result, NewConstraint) ||
-      gcdMIVtest(Src, Dst, Result);
+    return weakZeroSrcSIVtest(DstCoeff, SrcConst, DstConst, CurLoop, Level,
+                              Result, NewConstraint) ||
+           gcdMIVtest(Src, Dst, Result);
   }
   llvm_unreachable("SIV test expected at least one AddRec");
   return false;
 }
 
-
 // testRDIV -
 // When we have a pair of subscripts of the form [c1 + a1*i] and [c2 + a2*j]
 // where i and j are induction variables, c1 and c2 are loop invariant,
@@ -2333,46 +2277,37 @@ bool DependenceInfo::testRDIV(const SCEV *Src, const SCEV *Dst,
     DstConst = DstAddRec->getStart();
     DstCoeff = DstAddRec->getStepRecurrence(*SE);
     DstLoop = DstAddRec->getLoop();
-  }
-  else if (SrcAddRec) {
+  } else if (SrcAddRec) {
     if (const SCEVAddRecExpr *tmpAddRec =
-        dyn_cast<SCEVAddRecExpr>(SrcAddRec->getStart())) {
+            dyn_cast<SCEVAddRecExpr>(SrcAddRec->getStart())) {
       SrcConst = tmpAddRec->getStart();
       SrcCoeff = tmpAddRec->getStepRecurrence(*SE);
       SrcLoop = tmpAddRec->getLoop();
       DstConst = Dst;
       DstCoeff = SE->getNegativeSCEV(SrcAddRec->getStepRecurrence(*SE));
       DstLoop = SrcAddRec->getLoop();
-    }
-    else
+    } else
       llvm_unreachable("RDIV reached by surprising SCEVs");
-  }
-  else if (DstAddRec) {
+  } else if (DstAddRec) {
     if (const SCEVAddRecExpr *tmpAddRec =
-        dyn_cast<SCEVAddRecExpr>(DstAddRec->getStart())) {
+            dyn_cast<SCEVAddRecExpr>(DstAddRec->getStart())) {
       DstConst = tmpAddRec->getStart();
       DstCoeff = tmpAddRec->getStepRecurrence(*SE);
       DstLoop = tmpAddRec->getLoop();
       SrcConst = Src;
       SrcCoeff = SE->getNegativeSCEV(DstAddRec->getStepRecurrence(*SE));
       SrcLoop = DstAddRec->getLoop();
-    }
-    else
+    } else
       llvm_unreachable("RDIV reached by surprising SCEVs");
-  }
-  else
+  } else
     llvm_unreachable("RDIV expected at least one AddRec");
-  return exactRDIVtest(SrcCoeff, DstCoeff,
-                       SrcConst, DstConst,
-                       SrcLoop, DstLoop,
+  return exactRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, SrcLoop, DstLoop,
                        Result) ||
-    gcdMIVtest(Src, Dst, Result) ||
-    symbolicRDIVtest(SrcCoeff, DstCoeff,
-                     SrcConst, DstConst,
-                     SrcLoop, DstLoop);
+         gcdMIVtest(Src, Dst, Result) ||
+         symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, SrcLoop,
+                          DstLoop);
 }
 
-
 // Tests the single-subscript MIV pair (Src and Dst) for dependence.
 // Return true if dependence disproved.
 // Can sometimes refine direction vectors.
@@ -2383,7 +2318,7 @@ bool DependenceInfo::testMIV(const SCEV *Src, const SCEV *Dst,
   LLVM_DEBUG(dbgs() << "    dst = " << *Dst << "\n");
   Result.Consistent = false;
   return gcdMIVtest(Src, Dst, Result) ||
-    banerjeeMIVtest(Src, Dst, Loops, Result);
+         banerjeeMIVtest(Src, Dst, Loops, Result);
 }
 
 // Given a product, e.g., 10*X*Y, returns the first constant operand,
@@ -2428,7 +2363,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
   // we can't quit the loop just because the GCD == 1.
   const SCEV *Coefficients = Src;
   while (const SCEVAddRecExpr *AddRec =
-         dyn_cast<SCEVAddRecExpr>(Coefficients)) {
+             dyn_cast<SCEVAddRecExpr>(Coefficients)) {
     const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
     // If the coefficient is the product of a constant and other stuff,
     // we can use the constant in the GCD computation.
@@ -2446,7 +2381,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
   // we can't quit the loop just because the GCD == 1.
   Coefficients = Dst;
   while (const SCEVAddRecExpr *AddRec =
-         dyn_cast<SCEVAddRecExpr>(Coefficients)) {
+             dyn_cast<SCEVAddRecExpr>(Coefficients)) {
     const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
     // If the coefficient is the product of a constant and other stuff,
     // we can use the constant in the GCD computation.
@@ -2468,16 +2403,14 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
       if (isa<SCEVConstant>(Operand)) {
         assert(!Constant && "Surprised to find multiple constants");
         Constant = cast<SCEVConstant>(Operand);
-      }
-      else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) {
+      } else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) {
         // Search for constant operand to participate in GCD;
         // If none found; return false.
         std::optional<APInt> ConstOp = getConstantPart(Product);
         if (!ConstOp)
           return false;
         ExtraGCD = APIntOps::GreatestCommonDivisor(ExtraGCD, ConstOp->abs());
-      }
-      else
+      } else
         return false;
     }
   }
@@ -2512,7 +2445,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
   bool Improved = false;
   Coefficients = Src;
   while (const SCEVAddRecExpr *AddRec =
-         dyn_cast<SCEVAddRecExpr>(Coefficients)) {
+             dyn_cast<SCEVAddRecExpr>(Coefficients)) {
     Coefficients = AddRec->getStart();
     const Loop *CurLoop = AddRec->getLoop();
     RunningGCD = ExtraGCD;
@@ -2578,7 +2511,6 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
   return false;
 }
 
-
 //===----------------------------------------------------------------------===//
 // banerjeeMIVtest -
 // Use Banerjee's Inequalities to test an MIV subscript pair.
@@ -2652,8 +2584,8 @@ bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst,
   if (testBounds(Dependence::DVEntry::ALL, 0, Bound, Delta)) {
     // Explore the direction vector hierarchy.
     unsigned DepthExpanded = 0;
-    unsigned NewDeps = exploreDirections(1, A, B, Bound,
-                                         Loops, DepthExpanded, Delta);
+    unsigned NewDeps =
+        exploreDirections(1, A, B, Bound, Loops, DepthExpanded, Delta);
     if (NewDeps > 0) {
       bool Improved = false;
       for (unsigned K = 1; K <= CommonLevels; ++K) {
@@ -2670,23 +2602,20 @@ bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst,
       }
       if (Improved)
         ++BanerjeeSuccesses;
-    }
-    else {
+    } else {
       ++BanerjeeIndependence;
       Disproved = true;
     }
-  }
-  else {
+  } else {
     ++BanerjeeIndependence;
     Disproved = true;
   }
-  delete [] Bound;
-  delete [] A;
-  delete [] B;
+  delete[] Bound;
+  delete[] A;
+  delete[] B;
   return Disproved;
 }
 
-
 // Hierarchically expands the direction vector
 // search space, combining the directions of discovered dependences
 // in the DirSet field of Bound. Returns the number of distinct
@@ -2788,27 +2717,26 @@ unsigned DependenceInfo::exploreDirections(unsigned Level, CoefficientInfo *A,
 
     // test bounds for <, *, *, ...
     if (testBounds(Dependence::DVEntry::LT, Level, Bound, Delta))
-      NewDeps += exploreDirections(Level + 1, A, B, Bound,
-                                   Loops, DepthExpanded, Delta);
+      NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded,
+                                   Delta);
 
     // Test bounds for =, *, *, ...
     if (testBounds(Dependence::DVEntry::EQ, Level, Bound, Delta))
-      NewDeps += exploreDirections(Level + 1, A, B, Bound,
-                                   Loops, DepthExpanded, Delta);
+      NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded,
+                                   Delta);
 
     // test bounds for >, *, *, ...
     if (testBounds(Dependence::DVEntry::GT, Level, Bound, Delta))
-      NewDeps += exploreDirections(Level + 1, A, B, Bound,
-                                   Loops, DepthExpanded, Delta);
+      NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded,
+                                   Delta);
 
     Bound[Level].Direction = Dependence::DVEntry::ALL;
     return NewDeps;
-  }
-  else
-    return exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, Delta);
+  } else
+    return exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded,
+                             Delta);
 }
 
-
 // Returns true iff the current bounds are plausible.
 bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level,
                                 BoundInfo *Bound, const SCEV *Delta) const {
@@ -2822,7 +2750,6 @@ bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level,
   return true;
 }
 
-
 // Computes the upper and lower bounds for level K
 // using the * direction. Records them in Bound.
 // Wolfe gives the equations
@@ -2840,17 +2767,16 @@ bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level,
 // and the upper bound is always >= 0.
 void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B,
                                    BoundInfo *Bound, unsigned K) const {
-  Bound[K].Lower[Dependence::DVEntry::ALL] = nullptr; // Default value = -infinity.
-  Bound[K].Upper[Dependence::DVEntry::ALL] = nullptr; // Default value = +infinity.
+  Bound[K].Lower[Dependence::DVEntry::ALL] =
+      nullptr; // Default value = -infinity.
+  Bound[K].Upper[Dependence::DVEntry::ALL] =
+      nullptr; // Default value = +infinity.
   if (Bound[K].Iterations) {
-    Bound[K].Lower[Dependence::DVEntry::ALL] =
-      SE->getMulExpr(SE->getMinusSCEV(A[K].NegPart, B[K].PosPart),
-                     Bound[K].Iterations);
-    Bound[K].Upper[Dependence::DVEntry::ALL] =
-      SE->getMulExpr(SE->getMinusSCEV(A[K].PosPart, B[K].NegPart),
-                     Bound[K].Iterations);
-  }
-  else {
+    Bound[K].Lower[Dependence::DVEntry::ALL] = SE->getMulExpr(
+        SE->getMinusSCEV(A[K].NegPart, B[K].PosPart), Bound[K].Iterations);
+    Bound[K].Upper[Dependence::DVEntry::ALL] = SE->getMulExpr(
+        SE->getMinusSCEV(A[K].PosPart, B[K].NegPart), Bound[K].Iterations);
+  } else {
     // If the difference is 0, we won't need to know the number of iterations.
     if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].NegPart, B[K].PosPart))
       Bound[K].Lower[Dependence::DVEntry::ALL] =
@@ -2861,7 +2787,6 @@ void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B,
   }
 }
 
-
 // Computes the upper and lower bounds for level K
 // using the = direction. Records them in Bound.
 // Wolfe gives the equations
@@ -2879,18 +2804,19 @@ void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B,
 // and the upper bound is always >= 0.
 void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B,
                                   BoundInfo *Bound, unsigned K) const {
-  Bound[K].Lower[Dependence::DVEntry::EQ] = nullptr; // Default value = -infinity.
-  Bound[K].Upper[Dependence::DVEntry::EQ] = nullptr; // Default value = +infinity.
+  Bound[K].Lower[Dependence::DVEntry::EQ] =
+      nullptr; // Default value = -infinity.
+  Bound[K].Upper[Dependence::DVEntry::EQ] =
+      nullptr; // Default value = +infinity.
   if (Bound[K].Iterations) {
     const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff);
     const SCEV *NegativePart = getNegativePart(Delta);
     Bound[K].Lower[Dependence::DVEntry::EQ] =
-      SE->getMulExpr(NegativePart, Bound[K].Iterations);
+        SE->getMulExpr(NegativePart, Bound[K].Iterations);
     const SCEV *PositivePart = getPositivePart(Delta);
     Bound[K].Upper[Dependence::DVEntry::EQ] =
-      SE->getMulExpr(PositivePart, Bound[K].Iterations);
-  }
-  else {
+        SE->getMulExpr(PositivePart, Bound[K].Iterations);
+  } else {
     // If the positive/negative part of the difference is 0,
     // we won't need to know the number of iterations.
     const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff);
@@ -2903,7 +2829,6 @@ void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B,
   }
 }
 
-
 // Computes the upper and lower bounds for level K
 // using the < direction. Records them in Bound.
 // Wolfe gives the equations
@@ -2919,35 +2844,35 @@ void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B,
 // We must be careful to handle the case where the upper bound is unknown.
 void DependenceInfo::findBoundsLT(CoefficientInfo *A, CoefficientInfo *B,
                                   BoundInfo *Bound, unsigned K) const {
-  Bound[K].Lower[Dependence::DVEntry::LT] = nullptr; // Default value = -infinity.
-  Bound[K].Upper[Dependence::DVEntry::LT] = nullptr; // Default value = +infinity.
+  Bound[K].Lower[Dependence::DVEntry::LT] =
+      nullptr; // Default value = -infinity.
+  Bound[K].Upper[Dependence::DVEntry::LT] =
+      nullptr; // Default value = +infinity.
   if (Bound[K].Iterations) {
     const SCEV *Iter_1 = SE->getMinusSCEV(
         Bound[K].Iterations, SE->getOne(Bound[K].Iterations->getType()));
     const SCEV *NegPart =
-      getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff));
+        getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff));
     Bound[K].Lower[Dependence::DVEntry::LT] =
-      SE->getMinusSCEV(SE->getMulExpr(NegPart, Iter_1), B[K].Coeff);
+        SE->getMinusSCEV(SE->getMulExpr(NegPart, Iter_1), B[K].Coeff);
     const SCEV *PosPart =
-      getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff));
+        getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff));
     Bound[K].Upper[Dependence::DVEntry::LT] =
-      SE->getMinusSCEV(SE->getMulExpr(PosPart, Iter_1), B[K].Coeff);
-  }
-  else {
+        SE->getMinusSCEV(SE->getMulExpr(PosPart, Iter_1), B[K].Coeff);
+  } else {
     // If the positive/negative part of the difference is 0,
     // we won't need to know the number of iterations.
     const SCEV *NegPart =
-      getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff));
+        getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff));
     if (NegPart->isZero())
       Bound[K].Lower[Dependence::DVEntry::LT] = SE->getNegativeSCEV(B[K].Coeff);
     const SCEV *PosPart =
-      getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff));
+        getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff));
     if (PosPart->isZero())
       Bound[K].Upper[Dependence::DVEntry::LT] = SE->getNegativeSCEV(B[K].Coeff);
   }
 }
 
-
 // Computes the upper and lower bounds for level K
 // using the > direction. Records them in Bound.
 // Wolfe gives the equations
@@ -2963,45 +2888,45 @@ void DependenceInfo::findBoundsLT(CoefficientInfo *A, CoefficientInfo *B,
 // We must be careful to handle the case where the upper bound is unknown.
 void DependenceInfo::findBoundsGT(CoefficientInfo *A, CoefficientInfo *B,
                                   BoundInfo *Bound, unsigned K) const {
-  Bound[K].Lower[Dependence::DVEntry::GT] = nullptr; // Default value = -infinity.
-  Bound[K].Upper[Dependence::DVEntry::GT] = nullptr; // Default value = +infinity.
+  Bound[K].Lower[Dependence::DVEntry::GT] =
+      nullptr; // Default value = -infinity.
+  Bound[K].Upper[Dependence::DVEntry::GT] =
+      nullptr; // Default value = +infinity.
   if (Bound[K].Iterations) {
     const SCEV *Iter_1 = SE->getMinusSCEV(
         Bound[K].Iterations, SE->getOne(Bound[K].Iterations->getType()));
     const SCEV *NegPart =
-      getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart));
+        getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart));
     Bound[K].Lower[Dependence::DVEntry::GT] =
-      SE->getAddExpr(SE->getMulExpr(NegPart, Iter_1), A[K].Coeff);
+        SE->getAddExpr(SE->getMulExpr(NegPart, Iter_1), A[K].Coeff);
     const SCEV *PosPart =
-      getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart));
+        getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart));
     Bound[K].Upper[Dependence::DVEntry::GT] =
-      SE->getAddExpr(SE->getMulExpr(PosPart, Iter_1), A[K].Coeff);
-  }
-  else {
+        SE->getAddExpr(SE->getMulExpr(PosPart, Iter_1), A[K].Coeff);
+  } else {
     // If the positive/negative part of the difference is 0,
     // we won't need to know the number of iterations.
-    const SCEV *NegPart = getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart));
+    const SCEV *NegPart =
+        getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart));
     if (NegPart->isZero())
       Bound[K].Lower[Dependence::DVEntry::GT] = A[K].Coeff;
-    const SCEV *PosPart = getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart));
+    const SCEV *PosPart =
+        getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart));
     if (PosPart->isZero())
       Bound[K].Upper[Dependence::DVEntry::GT] = A[K].Coeff;
   }
 }
 
-
 // X^+ = max(X, 0)
 const SCEV *DependenceInfo::getPositivePart(const SCEV *X) const {
   return SE->getSMaxExpr(X, SE->getZero(X->getType()));
 }
 
-
 // X^- = min(X, 0)
 const SCEV *DependenceInfo::getNegativePart(const SCEV *X) const {
   return SE->getSMinExpr(X, SE->getZero(X->getType()));
 }
 
-
 // Walks through the subscript,
 // collecting each coefficient, the associated loop bounds,
 // and recording its positive and negative parts for later use.
@@ -3046,7 +2971,6 @@ DependenceInfo::collectCoeffInfo(const SCEV *Subscript, bool SrcFlag,
   return CI;
 }
 
-
 // Looks through all the bounds info and
 // computes the lower bound given the current direction settings
 // at each level. If the lower bound for any level is -inf,
@@ -3062,7 +2986,6 @@ const SCEV *DependenceInfo::getLowerBound(BoundInfo *Bound) const {
   return Sum;
 }
 
-
 // Looks through all the bounds info and
 // computes the upper bound given the current direction settings
 // at each level. If the upper bound at any level is +inf,
@@ -3078,7 +3001,6 @@ const SCEV *DependenceInfo::getUpperBound(BoundInfo *Bound) const {
   return Sum;
 }
 
-
 //===----------------------------------------------------------------------===//
 // Constraint manipulation for Delta test.
 
@@ -3098,7 +3020,6 @@ const SCEV *DependenceInfo::findCoefficient(const SCEV *Expr,
   return findCoefficient(AddRec->getStart(), TargetLoop);
 }
 
-
 // Given a linear SCEV,
 // return the SCEV given by zeroing out the coefficient
 // corresponding to the specified loop.
@@ -3112,12 +3033,10 @@ const SCEV *DependenceInfo::zeroCoefficient(const SCEV *Expr,
   if (AddRec->getLoop() == TargetLoop)
     return AddRec->getStart();
   return SE->getAddRecExpr(zeroCoefficient(AddRec->getStart(), TargetLoop),
-                           AddRec->getStepRecurrence(*SE),
-                           AddRec->getLoop(),
+                           AddRec->getStepRecurrence(*SE), AddRec->getLoop(),
                            AddRec->getNoWrapFlags());
 }
 
-
 // Given a linear SCEV Expr,
 // return the SCEV given by adding some Value to the
 // coefficient corresponding to the specified TargetLoop.
@@ -3128,17 +3047,13 @@ const SCEV *DependenceInfo::addToCoefficient(const SCEV *Expr,
                                              const SCEV *Value) const {
   const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr);
   if (!AddRec) // create a new addRec
-    return SE->getAddRecExpr(Expr,
-                             Value,
-                             TargetLoop,
+    return SE->getAddRecExpr(Expr, Value, TargetLoop,
                              SCEV::FlagAnyWrap); // Worst case, with no info.
   if (AddRec->getLoop() == TargetLoop) {
     const SCEV *Sum = SE->getAddExpr(AddRec->getStepRecurrence(*SE), Value);
     if (Sum->isZero())
       return AddRec->getStart();
-    return SE->getAddRecExpr(AddRec->getStart(),
-                             Sum,
-                             AddRec->getLoop(),
+    return SE->getAddRecExpr(AddRec->getStart(), Sum, AddRec->getLoop(),
                              AddRec->getNoWrapFlags());
   }
   if (SE->isLoopInvariant(AddRec, TargetLoop))
@@ -3149,7 +3064,6 @@ const SCEV *DependenceInfo::addToCoefficient(const SCEV *Expr,
       AddRec->getNoWrapFlags());
 }
 
-
 // Review the constraints, looking for opportunities
 // to simplify a subscript pair (Src and Dst).
 // Return true if some simplification occurs.
@@ -3178,7 +3092,6 @@ bool DependenceInfo::propagate(const SCEV *&Src, const SCEV *&Dst,
   return Result;
 }
 
-
 // Attempt to propagate a distance
 // constraint into a subscript pair (Src and Dst).
 // Return true if some simplification occurs.
@@ -3204,7 +3117,6 @@ bool DependenceInfo::propagateDistance(const SCEV *&Src, const SCEV *&Dst,
   return true;
 }
 
-
 // Attempt to propagate a line
 // constraint into a subscript pair (Src and Dst).
 // Return true if some simplification occurs.
@@ -3224,22 +3136,22 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst,
   if (A->isZero()) {
     const SCEVConstant *Bconst = dyn_cast<SCEVConstant>(B);
     const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C);
-    if (!Bconst || !Cconst) return false;
+    if (!Bconst || !Cconst)
+      return false;
     APInt Beta = Bconst->getAPInt();
     APInt Charlie = Cconst->getAPInt();
     APInt CdivB = Charlie.sdiv(Beta);
     assert(Charlie.srem(Beta) == 0 && "C should be evenly divisible by B");
     const SCEV *AP_K = findCoefficient(Dst, CurLoop);
-    //    Src = SE->getAddExpr(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB)));
     Src = SE->getMinusSCEV(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB)));
     Dst = zeroCoefficient(Dst, CurLoop);
     if (!findCoefficient(Src, CurLoop)->isZero())
       Consistent = false;
-  }
-  else if (B->isZero()) {
+  } else if (B->isZero()) {
     const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A);
     const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C);
-    if (!Aconst || !Cconst) return false;
+    if (!Aconst || !Cconst)
+      return false;
     APInt Alpha = Aconst->getAPInt();
     APInt Charlie = Cconst->getAPInt();
     APInt CdivA = Charlie.sdiv(Alpha);
@@ -3249,11 +3161,11 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst,
     Src = zeroCoefficient(Src, CurLoop);
     if (!findCoefficient(Dst, CurLoop)->isZero())
       Consistent = false;
-  }
-  else if (isKnownPredicate(CmpInst::ICMP_EQ, A, B)) {
+  } else if (isKnownPredicate(CmpInst::ICMP_EQ, A, B)) {
     const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A);
     const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C);
-    if (!Aconst || !Cconst) return false;
+    if (!Aconst || !Cconst)
+      return false;
     APInt Alpha = Aconst->getAPInt();
     APInt Charlie = Cconst->getAPInt();
     APInt CdivA = Charlie.sdiv(Alpha);
@@ -3264,8 +3176,7 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst,
     Dst = addToCoefficient(Dst, CurLoop, A_K);
     if (!findCoefficient(Dst, CurLoop)->isZero())
       Consistent = false;
-  }
-  else {
+  } else {
     // paper is incorrect here, or perhaps just misleading
     const SCEV *A_K = findCoefficient(Src, CurLoop);
     Src = SE->getMulExpr(Src, A);
@@ -3281,7 +3192,6 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst,
   return true;
 }
 
-
 // Attempt to propagate a point
 // constraint into a subscript pair (Src and Dst).
 // Return true if some simplification occurs.
@@ -3302,7 +3212,6 @@ bool DependenceInfo::propagatePoint(const SCEV *&Src, const SCEV *&Dst,
   return true;
 }
 
-
 // Update direction vector entry based on the current constraint.
 void DependenceInfo::updateDirection(Dependence::DVEntry &Level,
                                      const Constraint &CurConstraint) const {
@@ -3322,34 +3231,28 @@ void DependenceInfo::updateDirection(Dependence::DVEntry &Level,
     if (!SE->isKnownNonNegative(Level.Distance)) // if may be negative
       NewDirection |= Dependence::DVEntry::GT;
     Level.Direction &= NewDirection;
-  }
-  else if (CurConstraint.isLine()) {
+  } else if (CurConstraint.isLine()) {
     Level.Scalar = false;
     Level.Distance = nullptr;
     // direction should be accurate
-  }
-  else if (CurConstraint.isPoint()) {
+  } else if (CurConstraint.isPoint()) {
     Level.Scalar = false;
     Level.Distance = nullptr;
     unsigned NewDirection = Dependence::DVEntry::NONE;
-    if (!isKnownPredicate(CmpInst::ICMP_NE,
-                          CurConstraint.getY(),
+    if (!isKnownPredicate(CmpInst::ICMP_NE, CurConstraint.getY(),
                           CurConstraint.getX()))
       // if X may be = Y
       NewDirection |= Dependence::DVEntry::EQ;
-    if (!isKnownPredicate(CmpInst::ICMP_SLE,
-                          CurConstraint.getY(),
+    if (!isKnownPredicate(CmpInst::ICMP_SLE, CurConstraint.getY(),
                           CurConstraint.getX()))
       // if Y may be > X
       NewDirection |= Dependence::DVEntry::LT;
-    if (!isKnownPredicate(CmpInst::ICMP_SGE,
-                          CurConstraint.getY(),
+    if (!isKnownPredicate(CmpInst::ICMP_SGE, CurConstraint.getY(),
                           CurConstraint.getX()))
       // if Y may be < X
       NewDirection |= Dependence::DVEntry::GT;
     Level.Direction &= NewDirection;
-  }
-  else
+  } else
     llvm_unreachable("constraint has unexpected kind");
 }
 
@@ -3425,7 +3328,7 @@ bool DependenceInfo::tryDelinearizeFixedSize(
         dyn_cast<SCEVUnknown>(SE->getPointerBase(DstAccessFn));
     assert(SrcBase && DstBase && SrcBase == DstBase &&
            "expected src and dst scev unknowns to be equal");
-    });
+  });
 
   SmallVector<int, 4> SrcSizes;
   SmallVector<int, 4> DstSizes;
@@ -3737,9 +3640,8 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
     Pair[P].Group.resize(Pairs);
     removeMatchingExtensions(&Pair[P]);
     Pair[P].Classification =
-      classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()),
-                   Pair[P].Dst, LI->getLoopFor(Dst->getParent()),
-                   Pair[P].Loops);
+        classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), Pair[P].Dst,
+                     LI->getLoopFor(Dst->getParent()), Pair[P].Loops);
     Pair[P].GroupLoops = Pair[P].Loops;
     Pair[P].Group.set(P);
     LLVM_DEBUG(dbgs() << "    subscript " << P << "\n");
@@ -3814,18 +3716,15 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
     if (Pair[SI].Classification == Subscript::NonLinear) {
       // ignore these, but collect loops for later
       ++NonlinearSubscriptPairs;
-      collectCommonLoops(Pair[SI].Src,
-                         LI->getLoopFor(Src->getParent()),
+      collectCommonLoops(Pair[SI].Src, LI->getLoopFor(Src->getParent()),
                          Pair[SI].Loops);
-      collectCommonLoops(Pair[SI].Dst,
-                         LI->getLoopFor(Dst->getParent()),
+      collectCommonLoops(Pair[SI].Dst, LI->getLoopFor(Dst->getParent()),
                          Pair[SI].Loops);
       Result.Consistent = false;
     } else if (Pair[SI].Classification == Subscript::ZIV) {
       // always separable
       Separable.set(SI);
-    }
-    else {
+    } else {
       // SIV, RDIV, or MIV, so check for coupled group
       bool Done = true;
       for (unsigned SJ = SI + 1; SJ < Pairs; ++SJ) {
@@ -3843,8 +3742,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
         if (Pair[SI].Group.count() == 1) {
           Separable.set(SI);
           ++SeparableSubscriptPairs;
-        }
-        else {
+        } else {
           Coupled.set(SI);
           ++CoupledSubscriptPairs;
         }
@@ -3950,10 +3848,9 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
                           Constraints, Result.Consistent)) {
               LLVM_DEBUG(dbgs() << "\t    Changed\n");
               ++DeltaPropagations;
-              Pair[SJ].Classification =
-                classifyPair(Pair[SJ].Src, LI->getLoopFor(Src->getParent()),
-                             Pair[SJ].Dst, LI->getLoopFor(Dst->getParent()),
-                             Pair[SJ].Loops);
+              Pair[SJ].Classification = classifyPair(
+                  Pair[SJ].Src, LI->getLoopFor(Src->getParent()), Pair[SJ].Dst,
+                  LI->getLoopFor(Dst->getParent()), Pair[SJ].Loops);
               switch (Pair[SJ].Classification) {
               case Subscript::ZIV:
                 LLVM_DEBUG(dbgs() << "ZIV\n");
@@ -3995,8 +3892,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
           LLVM_DEBUG(dbgs() << "MIV test\n");
           if (testMIV(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, Result))
             return nullptr;
-        }
-        else
+        } else
           llvm_unreachable("expected only MIV subscripts at this point");
       }
 
@@ -4052,8 +3948,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
         break;
       }
     }
-  }
-  else {
+  } else {
     // On the other hand, if all directions are equal and there's no
     // loop-independent dependence possible, then no dependence exists.
     bool AllEqual = true;
@@ -4158,9 +4053,8 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
     Pair[P].Group.resize(Pairs);
     removeMatchingExtensions(&Pair[P]);
     Pair[P].Classification =
-      classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()),
-                   Pair[P].Dst, LI->getLoopFor(Dst->getParent()),
-                   Pair[P].Loops);
+        classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), Pair[P].Dst,
+                     LI->getLoopFor(Dst->getParent()), Pair[P].Loops);
     Pair[P].GroupLoops = Pair[P].Loops;
     Pair[P].Group.set(P);
   }
@@ -4172,15 +4066,12 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
   for (unsigned SI = 0; SI < Pairs; ++SI) {
     if (Pair[SI].Classification == Subscript::NonLinear) {
       // ignore these, but collect loops for later
-      collectCommonLoops(Pair[SI].Src,
-                         LI->getLoopFor(Src->getParent()),
+      collectCommonLoops(Pair[SI].Src, LI->getLoopFor(Src->getParent()),
                          Pair[SI].Loops);
-      collectCommonLoops(Pair[SI].Dst,
-                         LI->getLoopFor(Dst->getParent()),
+      collectCommonLoops(Pair[SI].Dst, LI->getLoopFor(Dst->getParent()),
                          Pair[SI].Loops);
       Result.Consistent = false;
-    }
-    else if (Pair[SI].Classification == Subscript::ZIV)
+    } else if (Pair[SI].Classification == Subscript::ZIV)
       Separable.set(SI);
     else {
       // SIV, RDIV, or MIV, so check for coupled group
@@ -4214,8 +4105,8 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
     case Subscript::SIV: {
       unsigned Level;
       const SCEV *SplitIter = nullptr;
-      (void) testSIV(Pair[SI].Src, Pair[SI].Dst, Level,
-                     Result, NewConstraint, SplitIter);
+      (void)testSIV(Pair[SI].Src, Pair[SI].Dst, Level, Result, NewConstraint,
+                    SplitIter);
       if (Level == SplitLevel) {
         assert(SplitIter != nullptr);
         return SplitIter;
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index b3b4c37..425ea31 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -81,6 +81,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::exp:
   case Intrinsic::exp10:
   case Intrinsic::exp2:
+  case Intrinsic::ldexp:
   case Intrinsic::log:
   case Intrinsic::log10:
   case Intrinsic::log2:
@@ -108,6 +109,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::canonicalize:
   case Intrinsic::fptosi_sat:
   case Intrinsic::fptoui_sat:
+  case Intrinsic::lround:
+  case Intrinsic::llround:
   case Intrinsic::lrint:
   case Intrinsic::llrint:
   case Intrinsic::ucmp:
@@ -189,6 +192,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
   switch (ID) {
   case Intrinsic::fptosi_sat:
   case Intrinsic::fptoui_sat:
+  case Intrinsic::lround:
+  case Intrinsic::llround:
   case Intrinsic::lrint:
   case Intrinsic::llrint:
   case Intrinsic::vp_lrint:
@@ -203,6 +208,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
   case Intrinsic::vp_is_fpclass:
     return OpdIdx == 0;
   case Intrinsic::powi:
+  case Intrinsic::ldexp:
     return OpdIdx == -1 || OpdIdx == 1;
   default:
     return OpdIdx == -1;
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index f9d7e76..67f526f 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -1292,12 +1292,10 @@ DwarfCompileUnit::getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const {
   }
 }
 
-DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE,
-                                                 const DISubprogram *CalleeSP,
-                                                 bool IsTail,
-                                                 const MCSymbol *PCAddr,
-                                                 const MCSymbol *CallAddr,
-                                                 unsigned CallReg) {
+DIE &DwarfCompileUnit::constructCallSiteEntryDIE(
+    DIE &ScopeDIE, const DISubprogram *CalleeSP, bool IsTail,
+    const MCSymbol *PCAddr, const MCSymbol *CallAddr, unsigned CallReg,
+    DIType *AllocSiteTy) {
   // Insert a call site entry DIE within ScopeDIE.
   DIE &CallSiteDIE = createAndAddDIE(getDwarf5OrGNUTag(dwarf::DW_TAG_call_site),
                                      ScopeDIE, nullptr);
@@ -1306,7 +1304,7 @@ DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE,
     // Indirect call.
     addAddress(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_target),
                MachineLocation(CallReg));
-  } else {
+  } else if (CalleeSP) {
     DIE *CalleeDIE = getOrCreateSubprogramDIE(CalleeSP);
     assert(CalleeDIE && "Could not create DIE for call site entry origin");
     if (AddLinkageNamesToDeclCallOriginsForTuning(DD) &&
@@ -1351,6 +1349,9 @@ DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE,
                     getDwarf5OrGNUAttr(dwarf::DW_AT_call_return_pc), PCAddr);
   }
 
+  if (AllocSiteTy)
+    addType(CallSiteDIE, AllocSiteTy, dwarf::DW_AT_LLVM_alloc_type);
+
   return CallSiteDIE;
 }
 
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 09be22c..c2f6ca0 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -289,7 +289,8 @@ public:
   /// the \p CallReg is set to 0.
   DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, const DISubprogram *CalleeSP,
                                  bool IsTail, const MCSymbol *PCAddr,
-                                 const MCSymbol *CallAddr, unsigned CallReg);
+                                 const MCSymbol *CallAddr, unsigned CallReg,
+                                 DIType *AllocSiteTy);
   /// Construct call site parameter DIEs for the \p CallSiteDIE. The \p Params
   /// were collected by the \ref collectCallSiteParameters.
   /// Note: The order of parameters does not matter, since debuggers recognize
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 5ae2d2a..c27f100 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -936,6 +936,8 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
       if (MI.hasDelaySlot() && !delaySlotSupported(*&MI))
         return;
 
+      DIType *AllocSiteTy = dyn_cast_or_null<DIType>(MI.getHeapAllocMarker());
+
       // If this is a direct call, find the callee's subprogram.
       // In the case of an indirect call find the register that holds
       // the callee.
@@ -950,23 +952,23 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
         PhysRegCalleeOperand =
             PhysRegCalleeOperand && MCOI.OperandType == MCOI::OPERAND_REGISTER;
       }
-      if (!CalleeOp.isGlobal() && !PhysRegCalleeOperand)
-        continue;
 
       unsigned CallReg = 0;
       const DISubprogram *CalleeSP = nullptr;
       const Function *CalleeDecl = nullptr;
       if (PhysRegCalleeOperand) {
-        CallReg = CalleeOp.getReg();
-        if (!CallReg)
-          continue;
-      } else {
+        CallReg = CalleeOp.getReg(); // might be zero
+      } else if (CalleeOp.isGlobal()) {
         CalleeDecl = dyn_cast<Function>(CalleeOp.getGlobal());
-        if (!CalleeDecl || !CalleeDecl->getSubprogram())
-          continue;
-        CalleeSP = CalleeDecl->getSubprogram();
+        if (CalleeDecl)
+          CalleeSP = CalleeDecl->getSubprogram(); // might be nullptr
       }
 
+      // Omit DIE if we can't tell where the call goes *and* we don't want to
+      // add metadata to it.
+      if (CalleeSP == nullptr && CallReg == 0 && AllocSiteTy == nullptr)
+        continue;
+
       // TODO: Omit call site entries for runtime calls (objc_msgSend, etc).
 
       bool IsTail = TII->isTailCall(MI);
@@ -1000,7 +1002,7 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
                         << (IsTail ? " [IsTail]" : "") << "\n");
 
       DIE &CallSiteDIE = CU.constructCallSiteEntryDIE(
-          ScopeDIE, CalleeSP, IsTail, PCAddr, CallAddr, CallReg);
+          ScopeDIE, CalleeSP, IsTail, PCAddr, CallAddr, CallReg, AllocSiteTy);
 
       // Optionally emit call-site-param debug info.
       if (emitDebugEntryValues()) {
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 9ba1782..0f3ec8b 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -132,9 +132,10 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
   unsigned i = 0;
   unsigned NumFixedArgs = CB.getFunctionType()->getNumParams();
   for (const auto &Arg : CB.args()) {
-    ArgInfo OrigArg{ArgRegs[i], *Arg.get(), i, getAttributesForArgIdx(CB, i),
-                    i < NumFixedArgs};
+    ArgInfo OrigArg{ArgRegs[i], *Arg.get(), i, getAttributesForArgIdx(CB, i)};
     setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CB);
+    if (i >= NumFixedArgs)
+      OrigArg.Flags[0].setVarArg();
 
     // If we have an explicit sret argument that is an Instruction, (i.e., it
     // might point to function-local memory), we can't meaningfully tail-call.
@@ -301,7 +302,7 @@ void CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
     // double] -> double).
     SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
                            OrigArg.OrigArgIndex, OrigArg.Flags[0],
-                           OrigArg.IsFixed, OrigArg.OrigValue);
+                           OrigArg.OrigValue);
     return;
   }
 
@@ -313,7 +314,7 @@ void CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
   for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) {
     Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx);
     SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.OrigArgIndex,
-                           OrigArg.Flags[0], OrigArg.IsFixed);
+                           OrigArg.Flags[0]);
     if (NeedsRegBlock)
       SplitArgs.back().Flags[0].setInConsecutiveRegs();
   }
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d70e96938..7341914 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9390,8 +9390,7 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) {
   LLVMContext &Context = *DAG.getContext();
   unsigned NumStores = Stores.size();
   unsigned WideNumBits = NumStores * NarrowNumBits;
-  EVT WideVT = EVT::getIntegerVT(Context, WideNumBits);
-  if (WideVT != MVT::i16 && WideVT != MVT::i32 && WideVT != MVT::i64)
+  if (WideNumBits != 16 && WideNumBits != 32 && WideNumBits != 64)
     return SDValue();
 
   // Check if all bytes of the source value that we are looking at are stored
@@ -9445,7 +9444,7 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) {
         SourceValue = WideVal;
 
       // Give up if the source value type is smaller than the store size.
-      if (SourceValue.getScalarValueSizeInBits() < WideVT.getScalarSizeInBits())
+      if (SourceValue.getScalarValueSizeInBits() < WideNumBits)
         return SDValue();
     }
 
@@ -9469,6 +9468,8 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) {
     OffsetMap[Offset] = ByteOffsetFromBase;
   }
 
+  EVT WideVT = EVT::getIntegerVT(Context, WideNumBits);
+
   assert(FirstOffset != INT64_MAX && "First byte offset must be set");
   assert(FirstStore && "First store must be set");
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 649a310..bfa72bf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5630,6 +5630,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
   case ISD::FDIV:
   case ISD::FREM:
   case ISD::FCOPYSIGN:
+  case ISD::FP_EXTEND:
     // No poison except from flags (which is handled above)
     return false;
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index d0815e9..868e2f4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -2273,9 +2273,8 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
           Flags.setNoExt();
 
         for (unsigned i = 0; i < NumParts; ++i) {
-          Outs.push_back(ISD::OutputArg(Flags,
-                                        Parts[i].getValueType().getSimpleVT(),
-                                        VT, /*isfixed=*/true, 0, 0));
+          Outs.push_back(ISD::OutputArg(
+              Flags, Parts[i].getValueType().getSimpleVT(), VT, 0, 0));
           OutVals.push_back(Parts[i]);
         }
       }
@@ -2291,9 +2290,9 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
     assert(SwiftError.getFunctionArg() && "Need a swift error argument");
     ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
     Flags.setSwiftError();
-    Outs.push_back(ISD::OutputArg(
-        Flags, /*vt=*/TLI.getPointerTy(DL), /*argvt=*/EVT(TLI.getPointerTy(DL)),
-        /*isfixed=*/true, /*origidx=*/1, /*partOffs=*/0));
+    Outs.push_back(ISD::OutputArg(Flags, /*vt=*/TLI.getPointerTy(DL),
+                                  /*argvt=*/EVT(TLI.getPointerTy(DL)),
+                                  /*origidx=*/1, /*partOffs=*/0));
     // Create SDNode for the swifterror virtual register.
     OutVals.push_back(
         DAG.getRegister(SwiftError.getOrCreateVRegUseAt(
@@ -11124,6 +11123,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
       const Align OriginalAlignment(getABIAlignmentForCallingConv(ArgTy, DL));
       Flags.setOrigAlign(OriginalAlignment);
 
+      if (i >= CLI.NumFixedArgs)
+        Flags.setVarArg();
       if (Args[i].Ty->isPointerTy()) {
         Flags.setPointer();
         Flags.setPointerAddrSpace(
@@ -11246,8 +11247,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
         // For scalable vectors the scalable part is currently handled
         // by individual targets, so we just use the known minimum size here.
         ISD::OutputArg MyFlags(
-            Flags, Parts[j].getValueType().getSimpleVT(), VT,
-            i < CLI.NumFixedArgs, i,
+            Flags, Parts[j].getValueType().getSimpleVT(), VT, i,
             j * Parts[j].getValueType().getStoreSize().getKnownMinValue());
         if (NumParts > 1 && j == 0)
           MyFlags.Flags.setSplit();
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 9f525ea..17a01f48 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1772,7 +1772,7 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType,
       Flags.setZExt();
 
     for (unsigned i = 0; i < NumParts; ++i)
-      Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, /*isfixed=*/true, 0, 0));
+      Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, 0, 0));
   }
 }
 
diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
index 6ddb12b..8052773 100644
--- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
@@ -109,6 +109,7 @@ static bool isODRAttribute(uint16_t Attr) {
   case dwarf::DW_AT_specification:
   case dwarf::DW_AT_abstract_origin:
   case dwarf::DW_AT_import:
+  case dwarf::DW_AT_LLVM_alloc_type:
     return true;
   }
   llvm_unreachable("Improper attribute.");
diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
index 78c20a6..79904fc 100644
--- a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
@@ -18,24 +18,6 @@ namespace hlsl {
 namespace rootsig {
 
 template <typename T>
-static std::optional<StringRef> getEnumName(const T Value,
-                                            ArrayRef<EnumEntry<T>> Enums) {
-  for (const auto &EnumItem : Enums)
-    if (EnumItem.Value == Value)
-      return EnumItem.Name;
-  return std::nullopt;
-}
-
-template <typename T>
-static raw_ostream &printEnum(raw_ostream &OS, const T Value,
-                              ArrayRef<EnumEntry<T>> Enums) {
-  auto MaybeName = getEnumName(Value, Enums);
-  if (MaybeName)
-    OS << *MaybeName;
-  return OS;
-}
-
-template <typename T>
 static raw_ostream &printFlags(raw_ostream &OS, const T Value,
                                ArrayRef<EnumEntry<T>> Flags) {
   bool FlagSet = false;
@@ -46,9 +28,9 @@ static raw_ostream &printFlags(raw_ostream &OS, const T Value,
       if (FlagSet)
         OS << " | ";
 
-      auto MaybeFlag = getEnumName(T(Bit), Flags);
-      if (MaybeFlag)
-        OS << *MaybeFlag;
+      StringRef MaybeFlag = enumToStringRef(T(Bit), Flags);
+      if (!MaybeFlag.empty())
+        OS << MaybeFlag;
       else
         OS << "invalid: " << Bit;
 
@@ -70,43 +52,42 @@ static const EnumEntry<RegisterType> RegisterNames[] = {
 };
 
 static raw_ostream &operator<<(raw_ostream &OS, const Register &Reg) {
-  printEnum(OS, Reg.ViewType, ArrayRef(RegisterNames));
-  OS << Reg.Number;
+  OS << enumToStringRef(Reg.ViewType, ArrayRef(RegisterNames)) << Reg.Number;
 
   return OS;
 }
 
 static raw_ostream &operator<<(raw_ostream &OS,
                                const llvm::dxbc::ShaderVisibility &Visibility) {
-  printEnum(OS, Visibility, dxbc::getShaderVisibility());
+  OS << enumToStringRef(Visibility, dxbc::getShaderVisibility());
 
   return OS;
 }
 
 static raw_ostream &operator<<(raw_ostream &OS,
                                const llvm::dxbc::SamplerFilter &Filter) {
-  printEnum(OS, Filter, dxbc::getSamplerFilters());
+  OS << enumToStringRef(Filter, dxbc::getSamplerFilters());
 
   return OS;
 }
 
 static raw_ostream &operator<<(raw_ostream &OS,
                                const dxbc::TextureAddressMode &Address) {
-  printEnum(OS, Address, dxbc::getTextureAddressModes());
+  OS << enumToStringRef(Address, dxbc::getTextureAddressModes());
 
   return OS;
 }
 
 static raw_ostream &operator<<(raw_ostream &OS,
                                const dxbc::ComparisonFunc &CompFunc) {
-  printEnum(OS, CompFunc, dxbc::getComparisonFuncs());
+  OS << enumToStringRef(CompFunc, dxbc::getComparisonFuncs());
 
   return OS;
 }
 
 static raw_ostream &operator<<(raw_ostream &OS,
                                const dxbc::StaticBorderColor &BorderColor) {
-  printEnum(OS, BorderColor, dxbc::getStaticBorderColors());
+  OS << enumToStringRef(BorderColor, dxbc::getStaticBorderColors());
 
   return OS;
 }
@@ -119,8 +100,8 @@ static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
 };
 
 static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) {
-  printEnum(OS, dxil::ResourceClass(llvm::to_underlying(Type)),
-            ArrayRef(ResourceClassNames));
+  OS << enumToStringRef(dxil::ResourceClass(llvm::to_underlying(Type)),
+                        ArrayRef(ResourceClassNames));
 
   return OS;
 }
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
index 6d89fa7..9cf4ed1 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
@@ -58,13 +58,6 @@ static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
     {"Sampler", dxil::ResourceClass::Sampler},
 };
 
-static std::optional<StringRef> getResourceName(dxil::ResourceClass Class) {
-  for (const auto &ClassEnum : ResourceClassNames)
-    if (ClassEnum.Value == Class)
-      return ClassEnum.Name;
-  return std::nullopt;
-}
-
 namespace {
 
 // We use the OverloadVisit with std::visit to ensure the compiler catches if a
@@ -133,10 +126,11 @@ MDNode *MetadataBuilder::BuildRootConstants(const RootConstants &Constants) {
 
 MDNode *MetadataBuilder::BuildRootDescriptor(const RootDescriptor &Descriptor) {
   IRBuilder<> Builder(Ctx);
-  std::optional<StringRef> ResName =
-      getResourceName(dxil::ResourceClass(to_underlying(Descriptor.Type)));
-  assert(ResName && "Provided an invalid Resource Class");
-  SmallString<7> Name({"Root", *ResName});
+  StringRef ResName =
+      enumToStringRef(dxil::ResourceClass(to_underlying(Descriptor.Type)),
+                      ArrayRef(ResourceClassNames));
+  assert(!ResName.empty() && "Provided an invalid Resource Class");
+  SmallString<7> Name({"Root", ResName});
   Metadata *Operands[] = {
       MDString::get(Ctx, Name),
       ConstantAsMetadata::get(
@@ -174,11 +168,12 @@ MDNode *MetadataBuilder::BuildDescriptorTable(const DescriptorTable &Table) {
 MDNode *MetadataBuilder::BuildDescriptorTableClause(
     const DescriptorTableClause &Clause) {
   IRBuilder<> Builder(Ctx);
-  std::optional<StringRef> ResName =
-      getResourceName(dxil::ResourceClass(to_underlying(Clause.Type)));
-  assert(ResName && "Provided an invalid Resource Class");
+  StringRef ResName =
+      enumToStringRef(dxil::ResourceClass(to_underlying(Clause.Type)),
+                      ArrayRef(ResourceClassNames));
+  assert(!ResName.empty() && "Provided an invalid Resource Class");
   Metadata *Operands[] = {
-      MDString::get(Ctx, *ResName),
+      MDString::get(Ctx, ResName),
       ConstantAsMetadata::get(Builder.getInt32(Clause.NumDescriptors)),
       ConstantAsMetadata::get(Builder.getInt32(Clause.Reg.Number)),
       ConstantAsMetadata::get(Builder.getInt32(Clause.Space)),
diff --git a/llvm/lib/LTO/LTOModule.cpp b/llvm/lib/LTO/LTOModule.cpp
index e0a9758..7dd0611 100644
--- a/llvm/lib/LTO/LTOModule.cpp
+++ b/llvm/lib/LTO/LTOModule.cpp
@@ -203,8 +203,10 @@ LTOModule::makeLTOModule(MemoryBufferRef Buffer, const TargetOptions &options,
   // find machine architecture for this module
   std::string errMsg;
   const Target *march = TargetRegistry::lookupTarget(Triple, errMsg);
-  if (!march)
+  if (!march) {
+    Context.emitError(errMsg);
     return make_error_code(object::object_error::arch_not_found);
+  }
 
   // construct LTOModule, hand over ownership of module and target
   SubtargetFeatures Features;
diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp
index 5db2642..e09dc94 100644
--- a/llvm/lib/Object/MachOObjectFile.cpp
+++ b/llvm/lib/Object/MachOObjectFile.cpp
@@ -3115,7 +3115,7 @@ void ExportEntry::pushNode(uint64_t offset) {
   }
   State.ChildCount = *Children;
   if (State.ChildCount != 0 && Children + 1 >= Trie.end()) {
-    *E = malformedError("byte for count of childern in export trie data at "
+    *E = malformedError("byte for count of children in export trie data at "
                         "node: 0x" +
                         Twine::utohexstr(offset) +
                         " extends past end of trie data");
@@ -3157,7 +3157,7 @@ void ExportEntry::pushDownUntilBottom() {
     }
     for (const NodeState &node : nodes()) {
       if (node.Start == Trie.begin() + childNodeIndex){
-        *E = malformedError("loop in childern in export trie data at node: 0x" +
+        *E = malformedError("loop in children in export trie data at node: 0x" +
                             Twine::utohexstr(Top.Start - Trie.begin()) +
                             " back to node: 0x" +
                             Twine::utohexstr(childNodeIndex));
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 46084c5..3d688a1 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -4949,6 +4949,21 @@ DoubleAPFloat &DoubleAPFloat::operator=(const DoubleAPFloat &RHS) {
   return *this;
 }
 
+// Returns a result such that:
+// 1. abs(Lo) <= ulp(Hi)/2
+// 2. Hi == RTNE(Hi + Lo)
+// 3. Hi + Lo == X + Y
+//
+// Requires that log2(X) >= log2(Y).
+static std::pair<APFloat, APFloat> fastTwoSum(APFloat X, APFloat Y) {
+  if (!X.isFinite())
+    return {X, APFloat::getZero(X.getSemantics(), /*Negative=*/false)};
+  APFloat Hi = X + Y;
+  APFloat Delta = Hi - X;
+  APFloat Lo = Y - Delta;
+  return {Hi, Lo};
+}
+
 // Implement addition, subtraction, multiplication and division based on:
 // "Software for Doubled-Precision Floating-Point Computations",
 // by Seppo Linnainmaa, ACM TOMS vol 7 no 3, September 1981, pages 272-283.
@@ -5218,10 +5233,78 @@ DoubleAPFloat::fusedMultiplyAdd(const DoubleAPFloat &Multiplicand,
 
 APFloat::opStatus DoubleAPFloat::roundToIntegral(APFloat::roundingMode RM) {
   assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
-  APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
-  auto Ret = Tmp.roundToIntegral(RM);
-  *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
-  return Ret;
+  const APFloat &Hi = getFirst();
+  const APFloat &Lo = getSecond();
+
+  APFloat RoundedHi = Hi;
+  const opStatus HiStatus = RoundedHi.roundToIntegral(RM);
+
+  // We can reduce the problem to just the high part if the input:
+  // 1. Represents a non-finite value.
+  // 2. Has a component which is zero.
+  if (!Hi.isFiniteNonZero() || Lo.isZero()) {
+    Floats[0] = std::move(RoundedHi);
+    Floats[1].makeZero(/*Neg=*/false);
+    return HiStatus;
+  }
+
+  // Adjust `Rounded` in the direction of `TieBreaker` if `ToRound` was at a
+  // halfway point.
+  auto RoundToNearestHelper = [](APFloat ToRound, APFloat Rounded,
+                                 APFloat TieBreaker) {
+    // RoundingError tells us which direction we rounded:
+    //   - RoundingError > 0: we rounded up.
+    //   - RoundingError < 0: we rounded down.
+    // Sterbenz' lemma ensures that RoundingError is exact.
+    const APFloat RoundingError = Rounded - ToRound;
+    if (TieBreaker.isNonZero() &&
+        TieBreaker.isNegative() != RoundingError.isNegative() &&
+        abs(RoundingError).isExactlyValue(0.5))
+      Rounded.add(
+          APFloat::getOne(Rounded.getSemantics(), TieBreaker.isNegative()),
+          rmNearestTiesToEven);
+    return Rounded;
+  };
+
+  // Case 1: Hi is not an integer.
+  // Special cases are for rounding modes that are sensitive to ties.
+  if (RoundedHi != Hi) {
+    // We need to consider the case where Hi was between two integers and the
+    // rounding mode broke the tie when, in fact, Lo may have had a different
+    // sign than Hi.
+    if (RM == rmNearestTiesToAway || RM == rmNearestTiesToEven)
+      RoundedHi = RoundToNearestHelper(Hi, RoundedHi, Lo);
+
+    Floats[0] = std::move(RoundedHi);
+    Floats[1].makeZero(/*Neg=*/false);
+    return HiStatus;
+  }
+
+  // Case 2: Hi is an integer.
+  // Special cases are for rounding modes which are rounding towards or away from zero.
+  RoundingMode LoRoundingMode;
+  if (RM == rmTowardZero)
+    // When our input is positive, we want the Lo component rounded toward
+    // negative infinity to get the smallest result magnitude. Likewise,
+    // negative inputs want the Lo component rounded toward positive infinity.
+    LoRoundingMode = isNegative() ? rmTowardPositive : rmTowardNegative;
+  else
+    LoRoundingMode = RM;
+
+  APFloat RoundedLo = Lo;
+  const opStatus LoStatus = RoundedLo.roundToIntegral(LoRoundingMode);
+  if (LoRoundingMode == rmNearestTiesToAway)
+    // We need to consider the case where Lo was between two integers and the
+    // rounding mode broke the tie when, in fact, Hi may have had a different
+    // sign than Lo.
+    RoundedLo = RoundToNearestHelper(Lo, RoundedLo, Hi);
+
+  // We must ensure that the final result has no overlap between the two APFloat values.
+  std::tie(RoundedHi, RoundedLo) = fastTwoSum(RoundedHi, RoundedLo);
+
+  Floats[0] = std::move(RoundedHi);
+  Floats[1] = std::move(RoundedLo);
+  return LoStatus;
 }
 
 void DoubleAPFloat::changeSign() {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bad7ccd..a40de86b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8537,7 +8537,7 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI,
       if (IsCalleeWin64) {
         UseVarArgCC = true;
       } else {
-        UseVarArgCC = !Outs[i].IsFixed;
+        UseVarArgCC = ArgFlags.isVarArg();
       }
     }
 
@@ -8982,7 +8982,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     unsigned NumArgs = Outs.size();
 
     for (unsigned i = 0; i != NumArgs; ++i) {
-      if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
+      if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
         report_fatal_error("Passing SVE types to variadic functions is "
                            "currently not supported");
     }
@@ -13482,7 +13482,7 @@ static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
   // Look for the first non-undef element.
   const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
 
-  // Benefit form APInt to handle overflow when calculating expected element.
+  // Benefit from APInt to handle overflow when calculating expected element.
   unsigned NumElts = VT.getVectorNumElements();
   unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
   APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
@@ -13490,7 +13490,7 @@ static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
   // The following shuffle indices must be the successive elements after the
   // first real element.
   bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
-    return Elt != ExpectedElt++ && Elt != -1;
+    return Elt != ExpectedElt++ && Elt >= 0;
   });
   if (FoundWrongElt)
     return false;
@@ -15777,6 +15777,7 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
           isREVMask(M, EltSize, NumElts, 32) ||
           isREVMask(M, EltSize, NumElts, 16) ||
           isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
+          isSingletonEXTMask(M, VT, DummyUnsigned) ||
           isTRNMask(M, NumElts, DummyUnsigned) ||
           isUZPMask(M, NumElts, DummyUnsigned) ||
           isZIPMask(M, NumElts, DummyUnsigned) ||
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index b97d622..fd4ef2a 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -8,8 +8,8 @@
 //
 // This pass performs below peephole optimizations on MIR level.
 //
-// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
-//    MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
+// 1. MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri
+//    MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri
 //
 // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi
 //    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
@@ -128,6 +128,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
   // Strategy used to split logical immediate bitmasks.
   enum class SplitStrategy {
     Intersect,
+    Disjoint,
   };
   template <typename T>
   bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
@@ -163,6 +164,7 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
 template <typename T>
 static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
   T UImm = static_cast<T>(Imm);
+  assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!");
 
   // The bitmask immediate consists of consecutive ones.  Let's say there is
   // constant 0b00000000001000000000010000000000 which does not consist of
@@ -191,18 +193,47 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
 }
 
 template <typename T>
+static bool splitDisjointBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc,
+                                    T &Imm2Enc) {
+  assert(Imm && (Imm != ~static_cast<T>(0)) && "Invalid immediate!");
+
+  // Try to split a bitmask of the form 0b00000000011000000000011110000000 into
+  // two disjoint masks such as 0b00000000011000000000000000000000 and
+  // 0b00000000000000000000011110000000 where the inclusive/exclusive OR of the
+  // new masks match the original mask.
+  unsigned LowestBitSet = llvm::countr_zero(Imm);
+  unsigned LowestGapBitUnset =
+      LowestBitSet + llvm::countr_one(Imm >> LowestBitSet);
+
+  // Create a mask for the least significant group of consecutive ones.
+  assert(LowestGapBitUnset < sizeof(T) * CHAR_BIT && "Undefined behaviour!");
+  T NewImm1 = (static_cast<T>(1) << LowestGapBitUnset) -
+              (static_cast<T>(1) << LowestBitSet);
+  // Create a disjoint mask for the remaining ones.
+  T NewImm2 = Imm & ~NewImm1;
+
+  // Do not split if NewImm2 is not a valid bitmask immediate.
+  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
+    return false;
+
+  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
+  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
+  return true;
+}
+
+template <typename T>
 bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
                                               SplitStrategy Strategy,
                                               unsigned OtherOpc) {
-  // Try below transformation.
+  // Try below transformations.
   //
-  // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
-  // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
+  // MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri
+  // MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri
   //
   // The mov pseudo instruction could be expanded to multiple mov instructions
   // later. Let's try to split the constant operand of mov instruction into two
-  // bitmask immediates. It makes only two AND instructions instead of multiple
-  // mov + and instructions.
+  // bitmask immediates based on the given split strategy. It makes only two
+  // logical instructions instead of multiple mov + logic instructions.
 
   return splitTwoPartImm<T>(
       MI,
@@ -224,6 +255,9 @@ bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
         case SplitStrategy::Intersect:
           SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1);
           break;
+        case SplitStrategy::Disjoint:
+          SplitSucc = splitDisjointBitmaskImm(Imm, RegSize, Imm0, Imm1);
+          break;
         }
         if (SplitSucc)
           return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc);
@@ -889,6 +923,22 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
         Changed |= trySplitLogicalImm<uint64_t>(
             AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri);
         break;
+      case AArch64::EORWrr:
+        Changed |= trySplitLogicalImm<uint32_t>(AArch64::EORWri, MI,
+                                                SplitStrategy::Disjoint);
+        break;
+      case AArch64::EORXrr:
+        Changed |= trySplitLogicalImm<uint64_t>(AArch64::EORXri, MI,
+                                                SplitStrategy::Disjoint);
+        break;
+      case AArch64::ORRWrr:
+        Changed |= trySplitLogicalImm<uint32_t>(AArch64::ORRWri, MI,
+                                                SplitStrategy::Disjoint);
+        break;
+      case AArch64::ORRXrr:
+        Changed |= trySplitLogicalImm<uint64_t>(AArch64::ORRXri, MI,
+                                                SplitStrategy::Disjoint);
+        break;
       case AArch64::ORRWrs:
         Changed |= visitORR(MI);
         break;
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index adc984a..1bc1d98 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -22,7 +22,8 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320",
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
                                    FeaturePostRAScheduler,
-                                   FeatureUseWzrToVecMove]>;
+                                   FeatureUseWzrToVecMove,
+                                   FeatureUseFixedOverScalableIfEqualCost]>;
 
 def TuneA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
                                    "Cortex-A53 ARM processors", [
@@ -45,7 +46,8 @@ def TuneA510    : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510",
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
                                    FeaturePostRAScheduler,
-                                   FeatureUseWzrToVecMove
+                                   FeatureUseWzrToVecMove,
+                                   FeatureUseFixedOverScalableIfEqualCost
                                    ]>;
 
 def TuneA520    : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520",
@@ -53,7 +55,8 @@ def TuneA520    : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520",
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
                                    FeaturePostRAScheduler,
-                                   FeatureUseWzrToVecMove]>;
+                                   FeatureUseWzrToVecMove,
+                                   FeatureUseFixedOverScalableIfEqualCost]>;
 
 def TuneA520AE  : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520",
                                    "Cortex-A520AE ARM processors", [
@@ -756,7 +759,6 @@ def ProcessorFeatures {
                                  FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2,
                                  FeatureComplxNum, FeatureCRC, FeatureDotProd,
                                  FeatureFPARMv8,FeatureFullFP16, FeatureJS, FeatureLSE,
-                                 FeatureUseFixedOverScalableIfEqualCost,
                                  FeatureRAS, FeatureRCPC, FeatureRDM, FeatureFPAC];
   list<SubtargetFeature> A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
                                  FeatureMTE, FeatureETE, FeatureSVEBitPerm,
@@ -766,7 +768,6 @@ def ProcessorFeatures {
                                  FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC,
                                  FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, FeatureJS,
                                  FeatureNEON, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM,
-                                 FeatureUseFixedOverScalableIfEqualCost,
                                  FeatureDotProd, FeatureFPAC];
   list<SubtargetFeature> A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
                                  FeatureMTE, FeatureETE, FeatureSVEBitPerm,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 010d0aaa..2155ace 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -125,7 +125,7 @@ struct AArch64OutgoingValueAssigner
     bool UseVarArgsCCForFixed = IsCalleeWin && State.isVarArg();
 
     bool Res;
-    if (Info.IsFixed && !UseVarArgsCCForFixed) {
+    if (!Flags.isVarArg() && !UseVarArgsCCForFixed) {
       if (!IsReturn)
         applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT);
       Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
@@ -361,7 +361,7 @@ struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler {
     unsigned MaxSize = MemTy.getSizeInBytes() * 8;
     // For varargs, we always want to extend them to 8 bytes, in which case
     // we disable setting a max.
-    if (!Arg.IsFixed)
+    if (Arg.Flags[0].isVarArg())
       MaxSize = 0;
 
     Register ValVReg = Arg.Regs[RegIndex];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index d84f512..f266398 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1013,6 +1013,14 @@ def FeatureAgentScopeFineGrainedRemoteMemoryAtomics
   "device memory."
 >;
 
+def FeatureEmulatedSystemScopeAtomics
+  : SubtargetFeature<"emulated-system-scope-atomics",
+  "HasEmulatedSystemScopeAtomics",
+  "true",
+  "System scope atomics unsupported by the PCI-e are emulated in HW via CAS "
+  "loop and functional."
+>;
+
 def FeatureDefaultComponentZero : SubtargetFeature<"default-component-zero",
   "HasDefaultComponentZero",
   "true",
@@ -2062,6 +2070,7 @@ def FeatureISAVersion12_50 : FeatureSet<
    FeatureAtomicFMinFMaxF64FlatInsts,
    FeatureFlatBufferGlobalAtomicFaddF64Inst,
    FeatureMemoryAtomicFAddF32DenormalSupport,
+   FeatureEmulatedSystemScopeAtomics,
    FeatureGloballyAddressableScratch,
    FeatureKernargPreload,
    FeatureVmemPrefInsts,
@@ -2603,6 +2612,10 @@ def HasPkMinMax3Insts :
   Predicate<"Subtarget->hasPkMinMax3Insts()">,
   AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
 
+def HasSGetShaderCyclesInst :
+  Predicate<"Subtarget->hasSGetShaderCyclesInst()">,
+  AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+
 def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">,
   AssemblerPredicate<(all_of FeatureImageInsts)>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index a0c99b0..846a0b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -991,10 +991,21 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
     return true;
 
   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
-    if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
+    Intrinsic::ID IID = Intrinsic->getIntrinsicID();
+    switch (IID) {
+    case Intrinsic::read_register:
       return isReadRegisterSourceOfDivergence(Intrinsic);
-
-    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
+    case Intrinsic::amdgcn_addrspacecast_nonnull: {
+      unsigned SrcAS =
+          Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
+      unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
+      return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+             DstAS == AMDGPUAS::FLAT_ADDRESS &&
+             ST->hasGloballyAddressableScratch();
+    }
+    default:
+      return AMDGPU::isIntrinsicSourceOfDivergence(IID);
+    }
   }
 
   // Assume all function calls are a source of divergence.
@@ -1008,6 +1019,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
   if (isa<InvokeInst>(V))
     return true;
 
+  // If the target supports globally addressable scratch, the mapping from
+  // scratch memory to the flat aperture changes therefore an address space cast
+  // is no longer uniform.
+  if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
+    return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+           CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
+           ST->hasGloballyAddressableScratch();
+  }
+
   return false;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 5530886..f47ddf5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -187,6 +187,7 @@ protected:
   bool HasFlatBufferGlobalAtomicFaddF64Inst = false;
   bool HasDefaultComponentZero = false;
   bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
+  bool HasEmulatedSystemScopeAtomics = false;
   bool HasDefaultComponentBroadcast = false;
   bool HasXF32Insts = false;
   /// The maximum number of instructions that may be placed within an S_CLAUSE,
@@ -950,6 +951,12 @@ public:
     return HasAgentScopeFineGrainedRemoteMemoryAtomics;
   }
 
+  /// \return true is HW emulates system scope atomics unsupported by the PCI-e
+  /// via CAS loop.
+  bool hasEmulatedSystemScopeAtomics() const {
+    return HasEmulatedSystemScopeAtomics;
+  }
+
   bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
 
   bool hasDefaultComponentBroadcast() const {
@@ -1081,7 +1088,7 @@ public:
   }
 
   bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
-  bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
+  bool hasLDSFPAtomicAddF64() const { return GFX90AInsts || GFX1250Insts; }
 
   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
@@ -1555,12 +1562,16 @@ public:
   // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
   bool hasPkMinMax3Insts() const { return GFX1250Insts; }
 
+  // \returns ture if target has S_GET_SHADER_CYCLES_U64 instruction.
+  bool hasSGetShaderCyclesInst() const { return GFX1250Insts; }
+
   // \returns true if target has S_SETPRIO_INC_WG instruction.
   bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }
 
   // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
-  // of sign-extending.
-  bool hasGetPCZeroExtension() const { return GFX12Insts; }
+  // of sign-extending. Note that GFX1250 has not only fixed the bug but also
+  // extended VA to 57 bits.
+  bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; }
 
   /// \returns SGPR allocation granularity supported by the subtarget.
   unsigned getSGPRAllocGranule() const {
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index deadb7a..2d0102f 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -536,6 +536,10 @@ enum Id { // HwRegCode, (6) [5:0]
   ID_SQ_PERF_SNAPSHOT_DATA1 = 22,
   ID_SQ_PERF_SNAPSHOT_PC_LO = 23,
   ID_SQ_PERF_SNAPSHOT_PC_HI = 24,
+
+  // GFX1250
+  ID_XNACK_STATE_PRIV = 33,
+  ID_XNACK_MASK_gfx1250 = 34,
 };
 
 enum Offset : unsigned { // Offset, (5) [10:6]
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 63826b7..8f44c03 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17695,6 +17695,8 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
     if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
         RMW->hasMetadata("amdgpu.no.remote.memory"))
       return true;
+    if (Subtarget.hasEmulatedSystemScopeAtomics())
+      return true;
   } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
     return true;
 
@@ -17942,8 +17944,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   case AtomicRMWInst::UMax: {
     if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
         AS == AMDGPUAS::BUFFER_FAT_POINTER) {
-      // Always expand system scope min/max atomics.
-      if (HasSystemScope)
+      if (HasSystemScope && !Subtarget->hasEmulatedSystemScopeAtomics())
         return AtomicExpansionKind::CmpXChg;
     }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 3f61bbd..f20b22d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6122,10 +6122,11 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
           !Op.isIdenticalTo(*MO))
         return false;
 
-      // Do not fold a frame index into an instruction that already has a frame
-      // index. The frame index handling code doesn't handle fixing up operand
-      // constraints if there are multiple indexes.
-      if (Op.isFI() && MO->isFI())
+      // Do not fold a non-inlineable and non-register operand into an
+      // instruction that already has a frame index. The frame index handling
+      // code could not handle well when a frame index co-exists with another
+      // non-register operand, unless that operand is an inlineable immediate.
+      if (Op.isFI())
         return false;
     }
   } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
@@ -10073,7 +10074,30 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
 
 InstructionUniformity
 SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
+  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
   unsigned opcode = MI.getOpcode();
+
+  auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
+    Register Dst = MI.getOperand(0).getReg();
+    Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
+                                       : MI.getOperand(1).getReg();
+    LLT DstTy = MRI.getType(Dst);
+    LLT SrcTy = MRI.getType(Src);
+    unsigned DstAS = DstTy.getAddressSpace();
+    unsigned SrcAS = SrcTy.getAddressSpace();
+    return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+                   DstAS == AMDGPUAS::FLAT_ADDRESS &&
+                   ST.hasGloballyAddressableScratch()
+               ? InstructionUniformity::NeverUniform
+               : InstructionUniformity::Default;
+  };
+
+  // If the target supports globally addressable scratch, the mapping from
+  // scratch memory to the flat aperture changes therefore an address space cast
+  // is no longer uniform.
+  if (opcode == TargetOpcode::G_ADDRSPACE_CAST)
+    return HandleAddrSpaceCast(MI);
+
   if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
     auto IID = GI->getIntrinsicID();
     if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
@@ -10082,6 +10106,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
       return InstructionUniformity::AlwaysUniform;
 
     switch (IID) {
+    case Intrinsic::amdgcn_addrspacecast_nonnull:
+      return HandleAddrSpaceCast(MI);
     case Intrinsic::amdgcn_if:
     case Intrinsic::amdgcn_else:
       // FIXME: Uniform if second result
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 8303410..431d73b 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1653,6 +1653,12 @@ def S_SETPRIO_INC_WG : SOPP_Pseudo <"s_setprio_inc_wg", (ins i16imm:$simm16), "$
   let SubtargetPredicate = HasSetPrioIncWgInst;
 }
 
+def S_GET_SHADER_CYCLES_U64 : SOP1_64_0 <"s_get_shader_cycles_u64",
+  [(set i64:$sdst, (readcyclecounter))]> {
+  let SubtargetPredicate = HasSGetShaderCyclesInst;
+  let hasSideEffects = 1;
+}
+
 let Uses = [EXEC, M0] in {
 def S_SENDMSG : SOPP_Pseudo <"s_sendmsg" , (ins SendMsg:$simm16), "$simm16",
   [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]> {
@@ -2145,6 +2151,7 @@ defm S_ALLOC_VGPR                 : SOP1_Real_gfx12<0x053>;
 defm S_SLEEP_VAR                  : SOP1_IMM_Real_gfx12<0x058>;
 
 // GFX1250
+defm S_GET_SHADER_CYCLES_U64      : SOP1_Real_gfx12<0x06>;
 defm S_ADD_PC_I64                 : SOP1_Real_gfx12<0x04b>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index e433b85..3d9455f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -223,6 +223,10 @@ static constexpr CustomOperand Operands[] = {
   {{"HW_REG_SQ_PERF_SNAPSHOT_PC_LO"}, ID_SQ_PERF_SNAPSHOT_PC_LO, isGFX940},
   {{"HW_REG_SQ_PERF_SNAPSHOT_PC_HI"}, ID_SQ_PERF_SNAPSHOT_PC_HI, isGFX940},
 
+  // GFX1250
+  {{"HW_REG_XNACK_STATE_PRIV"},       ID_XNACK_STATE_PRIV,       isGFX1250},
+  {{"HW_REG_XNACK_MASK"},             ID_XNACK_MASK_gfx1250,     isGFX1250},
+
   // Aliases
   {{"HW_REG_HW_ID"},                  ID_HW_ID1,                 isGFX10},
 };
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 3955f2a..25ad9ec 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -669,7 +669,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
       default: {
         // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows
         // us to  fold the constant into the cmp instruction.
-        RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
+        RHS = DAG.getSignedConstant(C->getSExtValue() + 1, DL, VT);
         CC = ISD::SETGE;
         break;
       }
@@ -713,7 +713,10 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
     // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows us to
     // fold the constant into the cmp instruction.
     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
-      RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
+      // Doing a "icmp ugt i16 65535, %0" comparison should have been converted
+      // already to something else. Assert to make sure this assumption holds.
+      assert((!C->isAllOnes()) && "integer overflow in comparison transform");
+      RHS = DAG.getConstant(C->getZExtValue() + 1, DL, VT);
       CC = ISD::SETUGE;
       break;
     }
diff --git a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp
index 73abfe7..306db6a 100644
--- a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp
+++ b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp
@@ -87,17 +87,50 @@ static bool forwardHandleAccesses(Function &F, DominatorTree &DT) {
 
   for (LoadInst *LI : LoadsToProcess) {
     Value *V = LI->getPointerOperand();
-    auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
+    auto *GV = dyn_cast<GlobalVariable>(V);
 
     // If we didn't find the global, we may need to walk through a level of
     // indirection. This generally happens at -O0.
-    if (!GV)
+    if (!GV) {
       if (auto *NestedLI = dyn_cast<LoadInst>(V)) {
         BasicBlock::iterator BBI(NestedLI);
         Value *Loaded = FindAvailableLoadedValue(
             NestedLI, NestedLI->getParent(), BBI, 0, nullptr, nullptr);
         GV = dyn_cast_or_null<GlobalVariable>(Loaded);
+      } else if (auto *NestedAlloca = dyn_cast<AllocaInst>(V)) {
+        for (auto &Use : NestedAlloca->uses()) {
+          auto *Store = dyn_cast<StoreInst>(Use.getUser());
+          if (!Store)
+            continue;
+
+          Value *StoredVal = Store->getValueOperand();
+          if (!StoredVal)
+            continue;
+
+          // Try direct global match
+          GV = dyn_cast<GlobalVariable>(StoredVal);
+          if (GV)
+            break;
+
+          // If it's a load, check its source
+          if (auto *Load = dyn_cast<LoadInst>(StoredVal)) {
+            GV = dyn_cast<GlobalVariable>(Load->getPointerOperand());
+            if (GV)
+              break;
+
+            // If loading from an unmodified stack copy of the global, reuse the
+            // global's value. Note: we are just repeating what we are doing for
+            // the load case for the alloca store pattern.
+            BasicBlock::iterator BBI(Load);
+            Value *Loaded = FindAvailableLoadedValue(Load, Load->getParent(),
+                                                     BBI, 0, nullptr, nullptr);
+            GV = dyn_cast<GlobalVariable>(Loaded);
+            if (GV)
+              break;
+          }
+        }
       }
+    }
 
     auto It = HandleMap.find(GV);
     if (It == HandleMap.end()) {
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index a5bf0e5..6583a0f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -6729,8 +6729,7 @@ static bool CC_LoongArchAssign2GRLen(unsigned GRLen, CCState &State,
 static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI,
                          unsigned ValNo, MVT ValVT,
                          CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
-                         CCState &State, bool IsFixed, bool IsRet,
-                         Type *OrigTy) {
+                         CCState &State, bool IsRet, Type *OrigTy) {
   unsigned GRLen = DL.getLargestLegalIntTypeSizeInBits();
   assert((GRLen == 32 || GRLen == 64) && "Unspport GRLen");
   MVT GRLenVT = GRLen == 32 ? MVT::i32 : MVT::i64;
@@ -6752,7 +6751,7 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI,
   case LoongArchABI::ABI_LP64F:
   case LoongArchABI::ABI_ILP32D:
   case LoongArchABI::ABI_LP64D:
-    UseGPRForFloat = !IsFixed;
+    UseGPRForFloat = ArgFlags.isVarArg();
     break;
   case LoongArchABI::ABI_ILP32S:
   case LoongArchABI::ABI_LP64S:
@@ -6766,7 +6765,8 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI,
   // will not be passed by registers if the original type is larger than
   // 2*GRLen, so the register alignment rule does not apply.
   unsigned TwoGRLenInBytes = (2 * GRLen) / 8;
-  if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoGRLenInBytes &&
+  if (ArgFlags.isVarArg() &&
+      ArgFlags.getNonZeroOrigAlign() == TwoGRLenInBytes &&
       DL.getTypeAllocSize(OrigTy) == TwoGRLenInBytes) {
     unsigned RegIdx = State.getFirstUnallocated(ArgGPRs);
     // Skip 'odd' register if necessary.
@@ -6916,7 +6916,7 @@ void LoongArchTargetLowering::analyzeInputArgs(
     LoongArchABI::ABI ABI =
         MF.getSubtarget<LoongArchSubtarget>().getTargetABI();
     if (Fn(MF.getDataLayout(), ABI, i, ArgVT, CCValAssign::Full, Ins[i].Flags,
-           CCInfo, /*IsFixed=*/true, IsRet, ArgTy)) {
+           CCInfo, IsRet, ArgTy)) {
       LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type " << ArgVT
                         << '\n');
       llvm_unreachable("");
@@ -6934,7 +6934,7 @@ void LoongArchTargetLowering::analyzeOutputArgs(
     LoongArchABI::ABI ABI =
         MF.getSubtarget<LoongArchSubtarget>().getTargetABI();
     if (Fn(MF.getDataLayout(), ABI, i, ArgVT, CCValAssign::Full, Outs[i].Flags,
-           CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) {
+           CCInfo, IsRet, OrigTy)) {
       LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type " << ArgVT
                         << "\n");
       llvm_unreachable("");
@@ -7647,8 +7647,7 @@ bool LoongArchTargetLowering::CanLowerReturn(
     LoongArchABI::ABI ABI =
         MF.getSubtarget<LoongArchSubtarget>().getTargetABI();
     if (CC_LoongArch(MF.getDataLayout(), ABI, i, Outs[i].VT, CCValAssign::Full,
-                     Outs[i].Flags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true,
-                     nullptr))
+                     Outs[i].Flags, CCInfo, /*IsRet=*/true, nullptr))
       return false;
   }
   return true;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 6b49a98f..f79ba74 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -330,7 +330,7 @@ private:
                                    unsigned ValNo, MVT ValVT,
                                    CCValAssign::LocInfo LocInfo,
                                    ISD::ArgFlagsTy ArgFlags, CCState &State,
-                                   bool IsFixed, bool IsRet, Type *OrigTy);
+                                   bool IsRet, Type *OrigTy);
 
   void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
                         const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet,
diff --git a/llvm/lib/Target/Mips/MipsCCState.cpp b/llvm/lib/Target/Mips/MipsCCState.cpp
index 9e8cd2e..13237c5 100644
--- a/llvm/lib/Target/Mips/MipsCCState.cpp
+++ b/llvm/lib/Target/Mips/MipsCCState.cpp
@@ -128,12 +128,10 @@ void MipsCCState::PreAnalyzeReturnValue(EVT ArgVT) {
   OriginalRetWasFloatVector.push_back(originalEVTTypeIsVectorFloat(ArgVT));
 }
 
-void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, bool IsFixed,
-                                        const char *Func) {
+void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, const char *Func) {
   OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, Func));
   OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy());
   OriginalArgWasFloatVector.push_back(ArgTy->isVectorTy());
-  CallOperandIsFixed.push_back(IsFixed);
 }
 
 /// Identify lowered values that originated from f128, float and sret to vXfXX
@@ -148,7 +146,6 @@ void MipsCCState::PreAnalyzeCallOperands(
     OriginalArgWasF128.push_back(originalTypeIsF128(FuncArg.Ty, Func));
     OriginalArgWasFloat.push_back(FuncArg.Ty->isFloatingPointTy());
     OriginalArgWasFloatVector.push_back(FuncArg.Ty->isVectorTy());
-    CallOperandIsFixed.push_back(Outs[i].IsFixed);
   }
 }
 
diff --git a/llvm/lib/Target/Mips/MipsCCState.h b/llvm/lib/Target/Mips/MipsCCState.h
index 4229da5..30b68e8 100644
--- a/llvm/lib/Target/Mips/MipsCCState.h
+++ b/llvm/lib/Target/Mips/MipsCCState.h
@@ -36,7 +36,7 @@ public:
   static bool originalEVTTypeIsVectorFloat(EVT Ty);
   static bool originalTypeIsVectorFloat(const Type *Ty);
 
-  void PreAnalyzeCallOperand(const Type *ArgTy, bool IsFixed, const char *Func);
+  void PreAnalyzeCallOperand(const Type *ArgTy, const char *Func);
 
   void PreAnalyzeFormalArgument(const Type *ArgTy, ISD::ArgFlagsTy Flags);
   void PreAnalyzeReturnValue(EVT ArgVT);
@@ -86,10 +86,6 @@ private:
   /// vector.
   SmallVector<bool, 4> OriginalRetWasFloatVector;
 
-  /// Records whether the value was a fixed argument.
-  /// See ISD::OutputArg::IsFixed,
-  SmallVector<bool, 4> CallOperandIsFixed;
-
   // Used to handle MIPS16-specific calling convention tweaks.
   // FIXME: This should probably be a fully fledged calling convention.
   SpecialCallingConvType SpecialCallingConv;
@@ -106,7 +102,6 @@ public:
     OriginalArgWasF128.clear();
     OriginalArgWasFloat.clear();
     OriginalArgWasFloatVector.clear();
-    CallOperandIsFixed.clear();
     PreAnalyzeCallOperands(Outs, FuncArgs, Func);
   }
 
@@ -213,7 +208,6 @@ public:
   bool WasOriginalRetVectorFloat(unsigned ValNo) const {
     return OriginalRetWasFloatVector[ValNo];
   }
-  bool IsCallOperandFixed(unsigned ValNo) { return CallOperandIsFixed[ValNo]; }
   SpecialCallingConvType getSpecialCallingConv() { return SpecialCallingConv; }
 };
 }
diff --git a/llvm/lib/Target/Mips/MipsCallLowering.cpp b/llvm/lib/Target/Mips/MipsCallLowering.cpp
index 555773a..fa49108 100644
--- a/llvm/lib/Target/Mips/MipsCallLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsCallLowering.cpp
@@ -47,7 +47,7 @@ struct MipsOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner {
     if (IsReturn)
       State.PreAnalyzeReturnValue(EVT::getEVT(Info.Ty));
     else
-      State.PreAnalyzeCallOperand(Info.Ty, Info.IsFixed, Func);
+      State.PreAnalyzeCallOperand(Info.Ty, Func);
 
     return CallLowering::OutgoingValueAssigner::assignArg(
         ValNo, OrigVT, ValVT, LocVT, LocInfo, Info, Flags, State);
diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td
index 39e184a..0e5c16c 100644
--- a/llvm/lib/Target/Mips/MipsCallingConv.td
+++ b/llvm/lib/Target/Mips/MipsCallingConv.td
@@ -29,12 +29,6 @@ class CCIfOrigArgWasFloat<CCAction A>
 class CCIfOrigArgWasF128<CCAction A>
     : CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgF128(ValNo)", A>;
 
-/// Match if this specific argument is a vararg.
-/// This is slightly different fro CCIfIsVarArg which matches if any argument is
-/// a vararg.
-class CCIfArgIsVarArg<CCAction A>
-    : CCIf<"!static_cast<MipsCCState *>(&State)->IsCallOperandFixed(ValNo)", A>;
-
 /// Match if the return was a floating point vector.
 class CCIfOrigArgWasNotVectorFloat<CCAction A>
     : CCIf<"!static_cast<MipsCCState *>(&State)"
@@ -344,7 +338,7 @@ def CC_Mips_VarArg : CallingConv<[
 ]>;
 
 def CC_Mips : CallingConv<[
-  CCIfVarArg<CCIfArgIsVarArg<CCDelegateTo<CC_Mips_VarArg>>>,
+  CCIfVarArg<CCIfArgVarArg<CCDelegateTo<CC_Mips_VarArg>>>,
   CCDelegateTo<CC_Mips_FixedArg>
 ]>;
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 15f45a1..d4f0cc9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -900,6 +900,17 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   if (STI.allowFP16Math() || STI.hasBF16Math())
     setTargetDAGCombine(ISD::SETCC);
 
+  // Vector reduction operations. These may be turned into shuffle or tree
+  // reductions depending on what instructions are available for each type.
+  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
+    MVT EltVT = VT.getVectorElementType();
+    if (EltVT == MVT::f32 || EltVT == MVT::f64) {
+      setOperationAction({ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
+                          ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
+                         VT, Custom);
+    }
+  }
+
   // Promote fp16 arithmetic if fp16 hardware isn't available or the
   // user passed --nvptx-no-fp16-math. The flag is useful because,
   // although sm_53+ GPUs have some sort of FP16 support in
@@ -1143,6 +1154,10 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(NVPTXISD::BFI)
     MAKE_CASE(NVPTXISD::PRMT)
     MAKE_CASE(NVPTXISD::FCOPYSIGN)
+    MAKE_CASE(NVPTXISD::FMAXNUM3)
+    MAKE_CASE(NVPTXISD::FMINNUM3)
+    MAKE_CASE(NVPTXISD::FMAXIMUM3)
+    MAKE_CASE(NVPTXISD::FMINIMUM3)
     MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
     MAKE_CASE(NVPTXISD::STACKRESTORE)
     MAKE_CASE(NVPTXISD::STACKSAVE)
@@ -1929,6 +1944,124 @@ static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL,
   return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
 }
 
+/// Reduces the elements using the scalar operations provided. The operations
+/// are sorted descending in number of inputs they take. The flags on the
+/// original reduction operation will be propagated to each scalar operation.
+/// Nearby elements are grouped in tree reduction, unlike the shuffle reduction
+/// used in ExpandReductions and SelectionDAG.
+static SDValue buildTreeReduction(
+    const SmallVector<SDValue> &Elements, EVT EltTy,
+    ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
+    const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
+  // Build the reduction tree at each level, starting with all the elements.
+  SmallVector<SDValue> Level = Elements;
+
+  unsigned OpIdx = 0;
+  while (Level.size() > 1) {
+    // Try to reduce this level using the current operator.
+    const auto [Op, NumInputs] = Ops[OpIdx];
+
+    // Build the next level by partially reducing all elements.
+    SmallVector<SDValue> ReducedLevel;
+    unsigned I = 0, E = Level.size();
+    for (; I + NumInputs <= E; I += NumInputs) {
+      // Reduce elements in groups of [NumInputs], as much as possible.
+      ReducedLevel.push_back(DAG.getNode(
+          Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags));
+    }
+
+    if (I < E) {
+      // Handle leftover elements.
+
+      if (ReducedLevel.empty()) {
+        // We didn't reduce anything at this level. We need to pick a smaller
+        // operator.
+        ++OpIdx;
+        assert(OpIdx < Ops.size() && "no smaller operators for reduction");
+        continue;
+      }
+
+      // We reduced some things but there's still more left, meaning the
+      // operator's number of inputs doesn't evenly divide this level size. Move
+      // these elements to the next level.
+      for (; I < E; ++I)
+        ReducedLevel.push_back(Level[I]);
+    }
+
+    // Process the next level.
+    Level = ReducedLevel;
+  }
+
+  return *Level.begin();
+}
+
+// Get scalar reduction opcode
+static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) {
+  switch (ReductionOpcode) {
+  case ISD::VECREDUCE_FMAX:
+    return ISD::FMAXNUM;
+  case ISD::VECREDUCE_FMIN:
+    return ISD::FMINNUM;
+  case ISD::VECREDUCE_FMAXIMUM:
+    return ISD::FMAXIMUM;
+  case ISD::VECREDUCE_FMINIMUM:
+    return ISD::FMINIMUM;
+  default:
+    llvm_unreachable("unhandled reduction opcode");
+  }
+}
+
+/// Get 3-input scalar reduction opcode
+static std::optional<NVPTXISD::NodeType>
+getScalar3OpcodeForReduction(unsigned ReductionOpcode) {
+  switch (ReductionOpcode) {
+  case ISD::VECREDUCE_FMAX:
+    return NVPTXISD::FMAXNUM3;
+  case ISD::VECREDUCE_FMIN:
+    return NVPTXISD::FMINNUM3;
+  case ISD::VECREDUCE_FMAXIMUM:
+    return NVPTXISD::FMAXIMUM3;
+  case ISD::VECREDUCE_FMINIMUM:
+    return NVPTXISD::FMINIMUM3;
+  default:
+    return std::nullopt;
+  }
+}
+
+/// Lower reductions to either a sequence of operations or a tree if
+/// reassociations are allowed. This method will use larger operations like
+/// max3/min3 when the target supports them.
+SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  const SDNodeFlags Flags = Op->getFlags();
+  SDValue Vector = Op.getOperand(0);
+
+  const unsigned Opcode = Op->getOpcode();
+  const EVT EltTy = Vector.getValueType().getVectorElementType();
+
+  // Whether we can use 3-input min/max when expanding the reduction.
+  const bool CanUseMinMax3 =
+      EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
+      STI.getPTXVersion() >= 88 &&
+      (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN ||
+       Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM);
+
+  // A list of SDNode opcodes with equivalent semantics, sorted descending by
+  // number of inputs they take.
+  SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
+
+  if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode);
+      CanUseMinMax3 && Opcode3Elem)
+    ScalarOps.push_back({*Opcode3Elem, 3});
+  ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2});
+
+  SmallVector<SDValue> Elements;
+  DAG.ExtractVectorElements(Vector, Elements);
+
+  return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
+}
+
 SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
   // Handle bitcasting from v2i8 without hitting the default promotion
   // strategy which goes through stack memory.
@@ -2808,6 +2941,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::CONCAT_VECTORS:
     return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::VECREDUCE_FMAX:
+  case ISD::VECREDUCE_FMIN:
+  case ISD::VECREDUCE_FMAXIMUM:
+  case ISD::VECREDUCE_FMINIMUM:
+    return LowerVECREDUCE(Op, DAG);
   case ISD::STORE:
     return LowerSTORE(Op, DAG);
   case ISD::LOAD:
@@ -3908,6 +4046,18 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
     return true;
   }
 
+  case Intrinsic::nvvm_prefetch_tensormap: {
+    auto &DL = I.getDataLayout();
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = getPointerTy(DL);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.flags =
+        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
+    Info.align.reset();
+    return true;
+  }
+
   case Intrinsic::nvvm_ldu_global_i:
   case Intrinsic::nvvm_ldu_global_f:
   case Intrinsic::nvvm_ldu_global_p: {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index cf72a1e..43e721a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -64,6 +64,11 @@ enum NodeType : unsigned {
   UNPACK_VECTOR,
 
   FCOPYSIGN,
+  FMAXNUM3,
+  FMINNUM3,
+  FMAXIMUM3,
+  FMINIMUM3,
+
   DYNAMIC_STACKALLOC,
   STACKRESTORE,
   STACKSAVE,
@@ -286,6 +291,7 @@ private:
 
   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index aac611d..1ab41bf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -347,6 +347,36 @@ multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
                Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
 }
 
+// Template for 3-input minimum/maximum instructions
+// (sm_100+/PTX 8.8 and f32 only)
+//
+// Also defines ftz (flush subnormal inputs and results to sign-preserving
+// zero) variants for fp32 functions.
+multiclass FMINIMUMMAXIMUM3<string OpcStr, bit NaN, SDNode OpNode> {
+  defvar nan_str = !if(NaN, ".NaN", "");
+   def f32rrr :
+     BasicFlagsNVPTXInst<(outs B32:$dst),
+               (ins B32:$a, B32:$b, B32:$c),
+               (ins FTZFlag:$ftz),
+               OpcStr # "$ftz" # nan_str # ".f32",
+               [(set f32:$dst, (OpNode f32:$a, f32:$b, f32:$c))]>,
+               Requires<[hasPTX<88>, hasSM<100>]>;
+   def f32rri :
+     BasicFlagsNVPTXInst<(outs B32:$dst),
+               (ins B32:$a, B32:$b, f32imm:$c),
+               (ins FTZFlag:$ftz),
+               OpcStr # "$ftz" # nan_str # ".f32",
+               [(set f32:$dst, (OpNode f32:$a, f32:$b, fpimm:$c))]>,
+               Requires<[hasPTX<88>, hasSM<100>]>;
+   def f32rii :
+     BasicFlagsNVPTXInst<(outs B32:$dst),
+               (ins B32:$a, f32imm:$b, f32imm:$c),
+               (ins FTZFlag:$ftz),
+               OpcStr # "$ftz" # nan_str # ".f32",
+               [(set f32:$dst, (OpNode f32:$a, fpimm:$b, fpimm:$c))]>,
+               Requires<[hasPTX<88>, hasSM<100>]>;
+}
+
 // Template for instructions which take three FP args.  The
 // instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
 //
@@ -900,6 +930,20 @@ defm MAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>;
 defm MIN_NAN : FMINIMUMMAXIMUM<"min", /* NaN */ true, fminimum>;
 defm MAX_NAN : FMINIMUMMAXIMUM<"max", /* NaN */ true, fmaximum>;
 
+def nvptx_fminnum3 : SDNode<"NVPTXISD::FMINNUM3", SDTFPTernaryOp,
+                            [SDNPCommutative]>;
+def nvptx_fmaxnum3 : SDNode<"NVPTXISD::FMAXNUM3", SDTFPTernaryOp,
+                             [SDNPCommutative]>;
+def nvptx_fminimum3 : SDNode<"NVPTXISD::FMINIMUM3", SDTFPTernaryOp,
+                             [SDNPCommutative]>;
+def nvptx_fmaximum3 : SDNode<"NVPTXISD::FMAXIMUM3", SDTFPTernaryOp,
+                             [SDNPCommutative]>;
+
+defm FMIN3 : FMINIMUMMAXIMUM3<"min", /* NaN */ false, nvptx_fminnum3>;
+defm FMAX3 : FMINIMUMMAXIMUM3<"max", /* NaN */ false, nvptx_fmaxnum3>;
+defm FMINNAN3 : FMINIMUMMAXIMUM3<"min", /* NaN */ true, nvptx_fminimum3>;
+defm FMAXNAN3 : FMINIMUMMAXIMUM3<"max", /* NaN */ true, nvptx_fmaximum3>;
+
 defm FABS  : F2<"abs", fabs>;
 defm FNEG  : F2<"neg", fneg>;
 defm FABS_H: F2_Support_Half<"abs", fabs>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index d337192..d4a0ca7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -39,6 +39,12 @@ def AS_match {
   code global = [{
    return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
   }];
+  code const = [{
+   return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_CONST);
+  }];
+  code param = [{
+   return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_PARAM);
+  }];
 }
 
 
@@ -950,33 +956,47 @@ foreach dim = 3...5 in {
 defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4",
                                      [hasTMACTAGroupSupport]>;
 
-//Prefetch and Prefetchu 
-
-let Predicates = [hasPTX<80>, hasSM<90>] in {
-  class PREFETCH_INTRS<string InstName> :
-            BasicNVPTXInst<(outs), (ins ADDR:$addr),
-            InstName,
-            [(!cast<Intrinsic>(!strconcat("int_nvvm_",
-            !subst(".", "_", InstName))) addr:$addr)]>;
+//Prefetchu and Prefetch
 
-  def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1">;
-  def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2">;
-  def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1">;
-  def PREFETCH_LOCAL_L1  : PREFETCH_INTRS<"prefetch.local.L1">;
-  def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">;
-  def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">;
+defvar frag_pat = (int_nvvm_prefetch_tensormap node:$addr);
 
-  def PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins ADDR:$addr),
-                                        "prefetch.global.L2::evict_normal",
-                                        [(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>;
+multiclass PREFETCH_TENSORMAP_PATFRAG<string suffix, code predicate> {
+  def !tolower(suffix) : PatFrag<!setdagop(frag_pat, ops), frag_pat, predicate>;
+}
 
-  def PREFETCH_GLOBAL_L2_EVICT_LAST   : BasicNVPTXInst<(outs), (ins ADDR:$addr),
-                                        "prefetch.global.L2::evict_last",
-                                        [(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>;
+defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"CONST", AS_match.const>;
+defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"GENERIC", AS_match.generic>;
+defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"PARAM", AS_match.param>;
 
-  def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">;
+multiclass PREFETCH_TENSORMAP_INST<string addrspace_name, PatFrag pattern_frag> {
+  def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr),
+           "prefetch" # addrspace_name # ".tensormap",
+           [(pattern_frag addr:$addr)]>,
+           Requires<[hasPTX<80>, hasSM<90>]>;
 }
 
+defm PREFETCH_CONST_TENSORMAP   : PREFETCH_TENSORMAP_INST<".const", prefetch_tensormap_const>;
+defm PREFETCH_GENERIC_TENSORMAP : PREFETCH_TENSORMAP_INST<"", prefetch_tensormap_generic>;
+defm PREFETCH_PARAM_TENSORMAP   : PREFETCH_TENSORMAP_INST<".param", prefetch_tensormap_param>;
+  
+class PREFETCH_INTRS<string InstName, Intrinsic Intr> :
+          BasicNVPTXInst<(outs), (ins ADDR:$addr),
+          InstName,
+          [(Intr addr:$addr)]>,
+          Requires<[hasPTX<80>, hasSM<90>]>;
+
+def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1", int_nvvm_prefetchu_L1>;   
+def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1", int_nvvm_prefetch_L1>;
+def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2", int_nvvm_prefetch_L2>;
+def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1", int_nvvm_prefetch_global_L1>;
+def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1", int_nvvm_prefetch_local_L1>;
+def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2", int_nvvm_prefetch_global_L2>;
+def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2", int_nvvm_prefetch_local_L2>;
+def PREFETCH_GLOBAL_L2_EVICT_NORMAL : PREFETCH_INTRS<"prefetch.global.L2::evict_normal", 
+                                      int_nvvm_prefetch_global_L2_evict_normal>;
+def PREFETCH_GLOBAL_L2_EVICT_LAST : PREFETCH_INTRS<"prefetch.global.L2::evict_last", 
+                                    int_nvvm_prefetch_global_L2_evict_last>;
+
 //Applypriority intrinsics
 class APPLYPRIORITY_L2_INTRS<string addrspace> :
           BasicNVPTXInst<(outs), (ins ADDR:$addr, B64:$size),
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 3ae2d9d..f4f8961 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -564,7 +564,8 @@ bool NVPTXTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
   case Intrinsic::nvvm_isspacep_global:
   case Intrinsic::nvvm_isspacep_local:
   case Intrinsic::nvvm_isspacep_shared:
-  case Intrinsic::nvvm_isspacep_shared_cluster: {
+  case Intrinsic::nvvm_isspacep_shared_cluster:
+  case Intrinsic::nvvm_prefetch_tensormap: {
     OpIndexes.push_back(0);
     return true;
   }
@@ -587,6 +588,11 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
       return ConstantInt::get(II->getType(), *R);
     return nullptr;
   }
+  case Intrinsic::nvvm_prefetch_tensormap: {
+    IRBuilder<> Builder(II);
+    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap,
+                                        NewV);
+  }
   }
   return nullptr;
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 9a6e261..b32d931b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -87,6 +87,13 @@ public:
   }
   unsigned getMinVectorRegisterBitWidth() const override { return 32; }
 
+  bool shouldExpandReduction(const IntrinsicInst *II) const override {
+    // Turn off ExpandReductions pass for NVPTX, which doesn't have advanced
+    // swizzling operations. Our backend/Selection DAG can expand these
+    // reductions with less movs.
+    return false;
+  }
+
   // We don't want to prevent inlining because of target-cpu and -features
   // attributes that were added to newer versions of LLVM/Clang: There are
   // no incompatible functions in PTX, ptxas will throw errors in such cases.
diff --git a/llvm/lib/Target/PowerPC/PPCCCState.h b/llvm/lib/Target/PowerPC/PPCCCState.h
index b0e50b2..feab9c5 100644
--- a/llvm/lib/Target/PowerPC/PPCCCState.h
+++ b/llvm/lib/Target/PowerPC/PPCCCState.h
@@ -38,36 +38,6 @@ public:
   void clearWasPPCF128() { OriginalArgWasPPCF128.clear(); }
 };
 
-class AIXCCState : public CCState {
-private:
-  BitVector IsFixed;
-
-public:
-  AIXCCState(CallingConv::ID CC, bool IsVarArg, MachineFunction &MF,
-             SmallVectorImpl<CCValAssign> &Locs, LLVMContext &C)
-      : CCState(CC, IsVarArg, MF, Locs, C) {}
-
-  void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
-                              CCAssignFn Fn) {
-    // All formal arguments are fixed.
-    IsFixed.resize(Ins.size(), true);
-    CCState::AnalyzeFormalArguments(Ins, Fn);
-  }
-
-  void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
-                           CCAssignFn Fn) {
-    // Record whether the call operand was a fixed argument.
-    IsFixed.resize(Outs.size(), false);
-    for (unsigned ValNo = 0, E = Outs.size(); ValNo != E; ++ValNo)
-      if (Outs[ValNo].IsFixed)
-        IsFixed.set(ValNo);
-
-    CCState::AnalyzeCallOperands(Outs, Fn);
-  }
-
-  bool isFixed(unsigned ValNo) const { return IsFixed.test(ValNo); }
-};
-
 } // end namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 47f0038..2698bd6 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -3925,9 +3925,6 @@ SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
                                                   SelectionDAG &DAG) const {
-  if (Subtarget.isAIXABI())
-    report_fatal_error("ADJUST_TRAMPOLINE operation is not supported on AIX.");
-
   return Op.getOperand(0);
 }
 
@@ -3984,9 +3981,6 @@ SDValue PPCTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                 SelectionDAG &DAG) const {
-  if (Subtarget.isAIXABI())
-    report_fatal_error("INIT_TRAMPOLINE operation is not supported on AIX.");
-
   SDValue Chain = Op.getOperand(0);
   SDValue Trmp = Op.getOperand(1); // trampoline
   SDValue FPtr = Op.getOperand(2); // nested function
@@ -3994,6 +3988,65 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
   SDLoc dl(Op);
 
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  if (Subtarget.isAIXABI()) {
+    // On AIX we create a trampoline descriptor by combining the
+    // entry point and TOC from the global descriptor (FPtr) with the
+    // nest argument as the environment pointer.
+    uint64_t PointerSize = Subtarget.isPPC64() ? 8 : 4;
+    MaybeAlign PointerAlign(PointerSize);
+    auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
+                        ? (MachineMemOperand::MODereferenceable |
+                           MachineMemOperand::MOInvariant)
+                        : MachineMemOperand::MONone;
+
+    uint64_t TOCPointerOffset = 1 * PointerSize;
+    uint64_t EnvPointerOffset = 2 * PointerSize;
+    SDValue SDTOCPtrOffset = DAG.getConstant(TOCPointerOffset, dl, PtrVT);
+    SDValue SDEnvPtrOffset = DAG.getConstant(EnvPointerOffset, dl, PtrVT);
+
+    const Value *TrampolineAddr =
+        cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+    const Function *Func =
+        cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
+
+    SDValue OutChains[3];
+
+    // Copy the entry point address from the global descriptor to the
+    // trampoline buffer.
+    SDValue LoadEntryPoint =
+        DAG.getLoad(PtrVT, dl, Chain, FPtr, MachinePointerInfo(Func, 0),
+                    PointerAlign, MMOFlags);
+    SDValue EPLoadChain = LoadEntryPoint.getValue(1);
+    OutChains[0] = DAG.getStore(EPLoadChain, dl, LoadEntryPoint, Trmp,
+                                MachinePointerInfo(TrampolineAddr, 0));
+
+    // Copy the TOC pointer from the global descriptor to the trampoline
+    // buffer.
+    SDValue TOCFromDescriptorPtr =
+        DAG.getNode(ISD::ADD, dl, PtrVT, FPtr, SDTOCPtrOffset);
+    SDValue TOCReg = DAG.getLoad(PtrVT, dl, Chain, TOCFromDescriptorPtr,
+                                 MachinePointerInfo(Func, TOCPointerOffset),
+                                 PointerAlign, MMOFlags);
+    SDValue TrampolineTOCPointer =
+        DAG.getNode(ISD::ADD, dl, PtrVT, Trmp, SDTOCPtrOffset);
+    SDValue TOCLoadChain = TOCReg.getValue(1);
+    OutChains[1] =
+        DAG.getStore(TOCLoadChain, dl, TOCReg, TrampolineTOCPointer,
+                     MachinePointerInfo(TrampolineAddr, TOCPointerOffset));
+
+    // Store the nest argument into the environment pointer in the trampoline
+    // buffer.
+    SDValue EnvPointer = DAG.getNode(ISD::ADD, dl, PtrVT, Trmp, SDEnvPtrOffset);
+    OutChains[2] =
+        DAG.getStore(Chain, dl, Nest, EnvPointer,
+                     MachinePointerInfo(TrampolineAddr, EnvPointerOffset));
+
+    SDValue TokenFactor =
+        DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+    return TokenFactor;
+  }
+
   bool isPPC64 = (PtrVT == MVT::i64);
   Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
 
@@ -6036,7 +6089,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
       ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
       bool Result;
 
-      if (Outs[i].IsFixed) {
+      if (!ArgFlags.isVarArg()) {
         Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
                                CCInfo);
       } else {
@@ -6852,8 +6905,7 @@ static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) {
 
 static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
                    CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
-                   CCState &S) {
-  AIXCCState &State = static_cast<AIXCCState &>(S);
+                   CCState &State) {
   const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
       State.getMachineFunction().getSubtarget());
   const bool IsPPC64 = Subtarget.isPPC64();
@@ -6865,9 +6917,6 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
   if (ValVT == MVT::f128)
     report_fatal_error("f128 is unimplemented on AIX.");
 
-  if (ArgFlags.isNest())
-    report_fatal_error("Nest arguments are unimplemented.");
-
   static const MCPhysReg GPR_32[] = {// 32-bit registers.
                                      PPC::R3, PPC::R4, PPC::R5, PPC::R6,
                                      PPC::R7, PPC::R8, PPC::R9, PPC::R10};
@@ -6882,6 +6931,14 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
 
   const ArrayRef<MCPhysReg> GPRs = IsPPC64 ? GPR_64 : GPR_32;
 
+  if (ArgFlags.isNest()) {
+    MCRegister EnvReg = State.AllocateReg(IsPPC64 ? PPC::X11 : PPC::R11);
+    if (!EnvReg)
+      report_fatal_error("More then one nest argument.");
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, EnvReg, RegVT, LocInfo));
+    return false;
+  }
+
   if (ArgFlags.isByVal()) {
     const Align ByValAlign(ArgFlags.getNonZeroByValAlign());
     if (ByValAlign > StackAlign)
@@ -7032,7 +7089,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
     // They are passed in VRs if any are available (unlike arguments passed
     // through ellipses) and shadow GPRs (unlike arguments to non-vaarg
     // functions)
-    if (State.isFixed(ValNo)) {
+    if (!ArgFlags.isVarArg()) {
       if (MCRegister VReg = State.AllocateReg(VR)) {
         State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));
         // Shadow allocate GPRs and stack space even though we pass in a VR.
@@ -7220,7 +7277,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
-  AIXCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
 
   const EVT PtrVT = getPointerTy(MF.getDataLayout());
   // Reserve space for the linkage area on the stack.
@@ -7567,8 +7624,8 @@ SDValue PPCTargetLowering::LowerCall_AIX(
 
   MachineFunction &MF = DAG.getMachineFunction();
   SmallVector<CCValAssign, 16> ArgLocs;
-  AIXCCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
-                    *DAG.getContext());
+  CCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
+                 *DAG.getContext());
 
   // Reserve space for the linkage save area (LSA) on the stack.
   // In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
index d2b75a6..34026ed 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
@@ -45,8 +45,8 @@ public:
                  CCValAssign::LocInfo LocInfo,
                  const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
                  CCState &State) override {
-    if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, Info.IsFixed,
-                      IsRet, Info.Ty))
+    if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, IsRet,
+                      Info.Ty))
       return true;
 
     StackSize = State.getStackSize();
@@ -196,8 +196,8 @@ public:
     if (LocVT.isScalableVector())
       MF.getInfo<RISCVMachineFunctionInfo>()->setIsVectorCall();
 
-    if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State,
-                      /*IsFixed=*/true, IsRet, Info.Ty))
+    if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, IsRet,
+                      Info.Ty))
       return true;
 
     StackSize = State.getStackSize();
@@ -454,7 +454,7 @@ bool RISCVCallLowering::canLowerReturn(MachineFunction &MF,
   for (unsigned I = 0, E = Outs.size(); I < E; ++I) {
     MVT VT = MVT::getVT(Outs[I].Ty);
     if (CC_RISCV(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], CCInfo,
-                 /*IsFixed=*/true, /*isRet=*/true, nullptr))
+                 /*isRet=*/true, nullptr))
       return false;
   }
   return true;
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
index cb6117e..70127e3 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
@@ -324,7 +324,7 @@ static MCRegister allocateRVVReg(MVT ValVT, unsigned ValNo, CCState &State,
 // Implements the RISC-V calling convention. Returns true upon failure.
 bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
                     CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
-                    CCState &State, bool IsFixed, bool IsRet, Type *OrigTy) {
+                    CCState &State, bool IsRet, Type *OrigTy) {
   const MachineFunction &MF = State.getMachineFunction();
   const DataLayout &DL = MF.getDataLayout();
   const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>();
@@ -379,12 +379,12 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
     break;
   case RISCVABI::ABI_ILP32F:
   case RISCVABI::ABI_LP64F:
-    UseGPRForF16_F32 = !IsFixed;
+    UseGPRForF16_F32 = ArgFlags.isVarArg();
     break;
   case RISCVABI::ABI_ILP32D:
   case RISCVABI::ABI_LP64D:
-    UseGPRForF16_F32 = !IsFixed;
-    UseGPRForF64 = !IsFixed;
+    UseGPRForF16_F32 = ArgFlags.isVarArg();
+    UseGPRForF64 = ArgFlags.isVarArg();
     break;
   }
 
@@ -465,7 +465,7 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
   // currently if we are using ILP32E calling convention. This behavior may be
   // changed when RV32E/ILP32E is ratified.
   unsigned TwoXLenInBytes = (2 * XLen) / 8;
-  if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes &&
+  if (ArgFlags.isVarArg() && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes &&
       DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes &&
       ABI != RISCVABI::ABI_ILP32E) {
     unsigned RegIdx = State.getFirstUnallocated(ArgGPRs);
@@ -620,8 +620,8 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
 // benchmark. But theoretically, it may have benefit for some cases.
 bool llvm::CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
                            CCValAssign::LocInfo LocInfo,
-                           ISD::ArgFlagsTy ArgFlags, CCState &State,
-                           bool IsFixed, bool IsRet, Type *OrigTy) {
+                           ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsRet,
+                           Type *OrigTy) {
   const MachineFunction &MF = State.getMachineFunction();
   const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>();
   const RISCVTargetLowering &TLI = *Subtarget.getTargetLowering();
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.h b/llvm/lib/Target/RISCV/RISCVCallingConv.h
index bf823b7..2030ce1 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.h
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.h
@@ -21,15 +21,15 @@ namespace llvm {
 typedef bool RISCVCCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT,
                              CCValAssign::LocInfo LocInfo,
                              ISD::ArgFlagsTy ArgFlags, CCState &State,
-                             bool IsFixed, bool IsRet, Type *OrigTy);
+                             bool IsRet, Type *OrigTy);
 
 bool CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
               CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
-              CCState &State, bool IsFixed, bool IsRet, Type *OrigTy);
+              CCState &State, bool IsRet, Type *OrigTy);
 
 bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
                      CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
-                     CCState &State, bool IsFixed, bool IsRet, Type *OrigTy);
+                     CCState &State, bool IsRet, Type *OrigTy);
 
 bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT,
                   CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 171940e..a7329d2 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1700,6 +1700,18 @@ def TuneNLogNVRGather
 def TunePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
     "UsePostRAScheduler", "true", "Schedule again after register allocation">;
 
+def TuneDisableMISchedLoadClustering : SubtargetFeature<"disable-misched-load-clustering",
+    "EnableMISchedLoadClustering", "false", "Disable load clustering in the machine scheduler">;
+
+def TuneDisableMISchedStoreClustering : SubtargetFeature<"disable-misched-store-clustering",
+    "EnableMISchedStoreClustering", "false", "Disable store clustering in the machine scheduler">;
+
+def TuneDisablePostMISchedLoadClustering : SubtargetFeature<"disable-postmisched-load-clustering",
+    "EnablePostMISchedLoadClustering", "false", "Disable PostRA load clustering in the machine scheduler">;
+
+def TuneDisablePostMISchedStoreClustering : SubtargetFeature<"disable-postmisched-store-clustering",
+    "EnablePostMISchedStoreClustering", "false", "Disable PostRA store clustering in the machine scheduler">;
+
 def TuneDisableLatencySchedHeuristic
     : SubtargetFeature<"disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
                        "Disable latency scheduling heuristic">;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 03e54b3..e4aa8b8 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -22282,8 +22282,8 @@ void RISCVTargetLowering::analyzeInputArgs(
     else if (In.isOrigArg())
       ArgTy = FType->getParamType(In.getOrigArgIndex());
 
-    if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo,
-           /*IsFixed=*/true, IsRet, ArgTy)) {
+    if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet,
+           ArgTy)) {
       LLVM_DEBUG(dbgs() << "InputArg #" << Idx << " has unhandled type "
                         << ArgVT << '\n');
       llvm_unreachable(nullptr);
@@ -22300,8 +22300,8 @@ void RISCVTargetLowering::analyzeOutputArgs(
     ISD::ArgFlagsTy ArgFlags = Out.Flags;
     Type *OrigTy = CLI ? CLI->getArgs()[Out.OrigArgIndex].Ty : nullptr;
 
-    if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, Out.IsFixed,
-           IsRet, OrigTy)) {
+    if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet,
+           OrigTy)) {
       LLVM_DEBUG(dbgs() << "OutputArg #" << Idx << " has unhandled type "
                         << ArgVT << "\n");
       llvm_unreachable(nullptr);
@@ -23083,7 +23083,7 @@ bool RISCVTargetLowering::CanLowerReturn(
     MVT VT = Outs[i].VT;
     ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
     if (CC_RISCV(i, VT, VT, CCValAssign::Full, ArgFlags, CCInfo,
-                 /*IsFixed=*/true, /*IsRet=*/true, nullptr))
+                 /*IsRet=*/true, nullptr))
       return false;
   }
   return true;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 04ffb05..413ad8b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -629,9 +629,6 @@ def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 8)),
 def : Pat<(and (or (shl GPR:$rs2, (XLenVT 8)),
                    (zexti8 (XLenVT GPR:$rs1))), 0xFFFF),
           (PACKH GPR:$rs1, GPR:$rs2)>;
-def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)),
-              (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
-          (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
 
 def : Pat<(binop_allhusers<or> (shl GPR:$rs2, (XLenVT 8)),
                                (zexti8 (XLenVT GPR:$rs1))),
@@ -642,11 +639,15 @@ let Predicates = [HasStdExtZbkb, IsRV32] in {
 def : Pat<(i32 (or (zexti16 (i32 GPR:$rs1)), (shl GPR:$rs2, (i32 16)))),
           (PACK GPR:$rs1, GPR:$rs2)>;
 
+def : Pat<(or (shl GPR:$rs2, (XLenVT 24)),
+              (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
+          (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+
 // Match a pattern of 2 bytes being inserted into bits [31:16], with bits
 // bits [15:0] coming from a zero extended value. We can use pack with packh for
 // bits [31:16]. If bits [15:0] can also be a packh, it can be matched
 // separately.
-def : Pat<(or (or (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24)),
+def : Pat<(or (or (shl GPR:$op1rs2, (XLenVT 24)),
                   (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
               (zexti16 (XLenVT GPR:$rs1))),
           (PACK (XLenVT GPR:$rs1),
@@ -657,12 +658,40 @@ let Predicates = [HasStdExtZbkb, IsRV64] in {
 def : Pat<(i64 (or (zexti32 (i64 GPR:$rs1)), (shl GPR:$rs2, (i64 32)))),
           (PACK GPR:$rs1, GPR:$rs2)>;
 
+def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)),
+              (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
+          (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (XLenVT 24)),
+                               (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
+          (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+
 def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (i64 16)),
                                (zexti16 (i64 GPR:$rs1))),
           (PACKW GPR:$rs1, GPR:$rs2)>;
 def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32),
                    (zexti16 (i64 GPR:$rs1)))),
           (PACKW GPR:$rs1, GPR:$rs2)>;
+
+// Match a pattern of 2 bytes being inserted into bits [31:16], with bits
+// bits [15:0] coming from a zero extended value, and bits [63:32] being
+// ignored. We can use packw with packh for bits [31:16]. If bits [15:0] can
+// also be a packh, it can be matched separately.
+def : Pat<(binop_allwusers<or>
+               (or (shl GPR:$op1rs2, (XLenVT 24)),
+                   (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
+               (zexti16 (XLenVT GPR:$rs1))),
+          (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+// We need to manually reassociate the patterns because of the binop_allwusers.
+def : Pat<(binop_allwusers<or>
+               (or (zexti16 (XLenVT GPR:$rs1)),
+                   (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
+               (shl GPR:$op1rs2, (XLenVT 24))),
+          (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+def : Pat<(binop_allwusers<or>
+               (or (zexti16 (XLenVT GPR:$rs1)),
+                   (shl GPR:$op1rs1, (XLenVT 24))),
+               (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))),
+          (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
 } // Predicates = [HasStdExtZbkb, IsRV64]
 
 let Predicates = [HasStdExtZbb, IsRV32] in
diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.td b/llvm/lib/Target/RISCV/RISCVMacroFusion.td
index 875a93d..39e099b 100644
--- a/llvm/lib/Target/RISCV/RISCVMacroFusion.td
+++ b/llvm/lib/Target/RISCV/RISCVMacroFusion.td
@@ -91,3 +91,59 @@ def TuneLDADDFusion
                    CheckIsImmOperand<2>,
                    CheckImmOperand<2, 0>
                  ]>>;
+
+defvar Load = [LB, LH, LW, LD, LBU, LHU, LWU];
+
+// Fuse add(.uw) followed by a load (lb, lh, lw, ld, lbu, lhu, lwu):
+//   add(.uw) rd, rs1, rs2
+//   load rd, imm12(rd)
+def TuneADDLoadFusion
+  : SimpleFusion<"add-load-fusion", "HasADDLoadFusion", "Enable ADD(.UW) + load macrofusion",
+                 CheckOpcode<[ADD, ADD_UW]>,
+                 CheckOpcode<Load>>;
+
+// Fuse AUIPC followed by by a load (lb, lh, lw, ld, lbu, lhu, lwu)
+//   auipc rd, imm20
+//   load rd, imm12(rd)
+def TuneAUIPCLoadFusion
+  : SimpleFusion<"auipc-load-fusion", "HasAUIPCLoadFusion",
+                 "Enable AUIPC + load macrofusion",
+                 CheckOpcode<[AUIPC]>,
+                 CheckOpcode<Load>>;
+
+// Fuse LUI followed by a load (lb, lh, lw, ld, lbu, lhu, lwu)
+//   lui rd, imm[31:12]
+//   load rd, imm12(rd)
+def TuneLUILoadFusion
+  : SimpleFusion<"lui-load-fusion", "HasLUILoadFusion",
+                 "Enable LUI + load macrofusion",
+                 CheckOpcode<[LUI]>,
+                 CheckOpcode<Load>>;
+
+// Bitfield extract fusion: similar to TuneShiftedZExtWFusion
+// but without the immediate restriction
+//   slli rd, rs1, imm12
+//   srli rd, rd, imm12
+def TuneBFExtFusion
+  : SimpleFusion<"bfext-fusion", "HasBFExtFusion",
+                 "Enable SLLI+SRLI (bitfield extract) macrofusion",
+                 CheckOpcode<[SLLI]>,
+                 CheckOpcode<[SRLI]>>;
+
+// Fuse ADDI followed by a load (lb, lh, lw, ld, lbu, lhu, lwu)
+//   addi rd, rs1, imm12
+//   load rd, imm12(rd)
+def TuneADDILoadFusion
+  : SimpleFusion<"addi-load-fusion", "HasADDILoadFusion",
+                 "Enable ADDI + load macrofusion",
+                 CheckOpcode<[ADDI]>,
+                 CheckOpcode<Load>>;
+
+// Fuse shXadd(.uw) followed by a load (lb, lh, lw, ld, lbu, lhu, lwu)
+//   shXadd(.uw) rd, rs1, rs2
+//   load rd, imm12(rd)
+def TuneSHXADDLoadFusion
+  : SimpleFusion<"shxadd-load-fusion", "HasSHXADDLoadFusion",
+                 "Enable SH(1|2|3)ADD(.UW) + load macrofusion",
+                 CheckOpcode<[SH1ADD, SH2ADD, SH3ADD, SH1ADD_UW, SH2ADD_UW, SH3ADD_UW]>,
+                 CheckOpcode<Load>>;
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 838edf6..31d2b3a 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -590,12 +590,17 @@ def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1",
                                              FeatureStdExtZicboz,
                                              FeatureVendorXVentanaCondOps],
                                              [TuneVentanaVeyron,
+                                              TuneDisableMISchedLoadClustering,
+                                              TuneDisablePostMISchedLoadClustering,
+                                              TuneDisablePostMISchedStoreClustering,
                                               TuneLUIADDIFusion,
                                               TuneAUIPCADDIFusion,
                                               TuneZExtHFusion,
                                               TuneZExtWFusion,
                                               TuneShiftedZExtWFusion,
-                                              TuneLDADDFusion]> {
+                                              TuneADDLoadFusion,
+                                              TuneAUIPCLoadFusion,
+                                              TuneLUILoadFusion]> {
   let MVendorID = 0x61f;
   let MArchID = 0x8000000000010000;
   let MImpID = 0x111;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 3f2a83f..66ce134 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -94,16 +94,6 @@ static cl::opt<bool>
                            cl::desc("Enable the loop data prefetch pass"),
                            cl::init(true));
 
-static cl::opt<bool> EnableMISchedLoadStoreClustering(
-    "riscv-misched-load-store-clustering", cl::Hidden,
-    cl::desc("Enable load and store clustering in the machine scheduler"),
-    cl::init(true));
-
-static cl::opt<bool> EnablePostMISchedLoadStoreClustering(
-    "riscv-postmisched-load-store-clustering", cl::Hidden,
-    cl::desc("Enable PostRA load and store clustering in the machine scheduler"),
-    cl::init(true));
-
 static cl::opt<bool> DisableVectorMaskMutation(
     "riscv-disable-vector-mask-mutation",
     cl::desc("Disable the vector mask scheduling mutation"), cl::init(false),
@@ -294,15 +284,17 @@ bool RISCVTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
 
 ScheduleDAGInstrs *
 RISCVTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
+  const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
   ScheduleDAGMILive *DAG = createSchedLive(C);
-  if (EnableMISchedLoadStoreClustering) {
+
+  if (ST.enableMISchedLoadClustering())
     DAG->addMutation(createLoadClusterDAGMutation(
         DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true));
+
+  if (ST.enableMISchedStoreClustering())
     DAG->addMutation(createStoreClusterDAGMutation(
         DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true));
-  }
 
-  const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
   if (!DisableVectorMaskMutation && ST.hasVInstructions())
     DAG->addMutation(createRISCVVectorMaskDAGMutation(DAG->TRI));
 
@@ -311,13 +303,16 @@ RISCVTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
 
 ScheduleDAGInstrs *
 RISCVTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
+  const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
   ScheduleDAGMI *DAG = createSchedPostRA(C);
-  if (EnablePostMISchedLoadStoreClustering) {
+
+  if (ST.enablePostMISchedLoadClustering())
     DAG->addMutation(createLoadClusterDAGMutation(
         DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true));
+
+  if (ST.enablePostMISchedStoreClustering())
     DAG->addMutation(createStoreClusterDAGMutation(
         DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true));
-  }
 
   return DAG;
 }
diff --git a/llvm/lib/Target/SPIRV/CMakeLists.txt b/llvm/lib/Target/SPIRV/CMakeLists.txt
index ba09451..6660de9 100644
--- a/llvm/lib/Target/SPIRV/CMakeLists.txt
+++ b/llvm/lib/Target/SPIRV/CMakeLists.txt
@@ -26,6 +26,7 @@ add_llvm_target(SPIRVCodeGen
   SPIRVGlobalRegistry.cpp
   SPIRVInstrInfo.cpp
   SPIRVInstructionSelector.cpp
+  SPIRVLegalizeImplicitBinding.cpp
   SPIRVStripConvergentIntrinsics.cpp
   SPIRVLegalizePointerCast.cpp
   SPIRVMergeRegionExitTargets.cpp
diff --git a/llvm/lib/Target/SPIRV/SPIRV.h b/llvm/lib/Target/SPIRV/SPIRV.h
index 1688fa3..1934e98 100644
--- a/llvm/lib/Target/SPIRV/SPIRV.h
+++ b/llvm/lib/Target/SPIRV/SPIRV.h
@@ -23,6 +23,7 @@ ModulePass *createSPIRVPrepareFunctionsPass(const SPIRVTargetMachine &TM);
 FunctionPass *createSPIRVStructurizerPass();
 FunctionPass *createSPIRVMergeRegionExitTargetsPass();
 FunctionPass *createSPIRVStripConvergenceIntrinsicsPass();
+ModulePass *createSPIRVLegalizeImplicitBindingPass();
 FunctionPass *createSPIRVLegalizePointerCastPass(SPIRVTargetMachine *TM);
 FunctionPass *createSPIRVRegularizerPass();
 FunctionPass *createSPIRVPreLegalizerCombiner();
@@ -49,6 +50,7 @@ void initializeSPIRVRegularizerPass(PassRegistry &);
 void initializeSPIRVMergeRegionExitTargetsPass(PassRegistry &);
 void initializeSPIRVPrepareFunctionsPass(PassRegistry &);
 void initializeSPIRVStripConvergentIntrinsicsPass(PassRegistry &);
+void initializeSPIRVLegalizeImplicitBindingPass(PassRegistry &);
 } // namespace llvm
 
 #endif // LLVM_LIB_TARGET_SPIRV_SPIRV_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp
new file mode 100644
index 0000000..0398e52
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp
@@ -0,0 +1,159 @@
+//===- SPIRVLegalizeImplicitBinding.cpp - Legalize implicit bindings ----*- C++
+//-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass legalizes the @llvm.spv.resource.handlefromimplicitbinding
+// intrinsic by replacing it with a call to
+// @llvm.spv.resource.handlefrombinding.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRV.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsSPIRV.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include <algorithm>
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+class SPIRVLegalizeImplicitBinding : public ModulePass {
+public:
+  static char ID;
+  SPIRVLegalizeImplicitBinding() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override;
+
+private:
+  void collectBindingInfo(Module &M);
+  uint32_t getAndReserveFirstUnusedBinding(uint32_t DescSet);
+  void replaceImplicitBindingCalls(Module &M);
+
+  // A map from descriptor set to a bit vector of used binding numbers.
+  std::vector<BitVector> UsedBindings;
+  // A list of all implicit binding calls, to be sorted by order ID.
+  SmallVector<CallInst *, 16> ImplicitBindingCalls;
+};
+
+struct BindingInfoCollector : public InstVisitor<BindingInfoCollector> {
+  std::vector<BitVector> &UsedBindings;
+  SmallVector<CallInst *, 16> &ImplicitBindingCalls;
+
+  BindingInfoCollector(std::vector<BitVector> &UsedBindings,
+                       SmallVector<CallInst *, 16> &ImplicitBindingCalls)
+      : UsedBindings(UsedBindings), ImplicitBindingCalls(ImplicitBindingCalls) {
+  }
+
+  void visitCallInst(CallInst &CI) {
+    if (CI.getIntrinsicID() == Intrinsic::spv_resource_handlefrombinding) {
+      const uint32_t DescSet =
+          cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
+      const uint32_t Binding =
+          cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
+
+      if (UsedBindings.size() <= DescSet) {
+        UsedBindings.resize(DescSet + 1);
+        UsedBindings[DescSet].resize(64);
+      }
+      if (UsedBindings[DescSet].size() <= Binding) {
+        UsedBindings[DescSet].resize(2 * Binding + 1);
+      }
+      UsedBindings[DescSet].set(Binding);
+    } else if (CI.getIntrinsicID() ==
+               Intrinsic::spv_resource_handlefromimplicitbinding) {
+      ImplicitBindingCalls.push_back(&CI);
+    }
+  }
+};
+
+void SPIRVLegalizeImplicitBinding::collectBindingInfo(Module &M) {
+  BindingInfoCollector InfoCollector(UsedBindings, ImplicitBindingCalls);
+  InfoCollector.visit(M);
+
+  // Sort the collected calls by their order ID.
+  std::sort(
+      ImplicitBindingCalls.begin(), ImplicitBindingCalls.end(),
+      [](const CallInst *A, const CallInst *B) {
+        const uint32_t OrderIdArgIdx = 0;
+        const uint32_t OrderA =
+            cast<ConstantInt>(A->getArgOperand(OrderIdArgIdx))->getZExtValue();
+        const uint32_t OrderB =
+            cast<ConstantInt>(B->getArgOperand(OrderIdArgIdx))->getZExtValue();
+        return OrderA < OrderB;
+      });
+}
+
+uint32_t SPIRVLegalizeImplicitBinding::getAndReserveFirstUnusedBinding(
+    uint32_t DescSet) {
+  if (UsedBindings.size() <= DescSet) {
+    UsedBindings.resize(DescSet + 1);
+    UsedBindings[DescSet].resize(64);
+  }
+
+  int NewBinding = UsedBindings[DescSet].find_first_unset();
+  if (NewBinding == -1) {
+    NewBinding = UsedBindings[DescSet].size();
+    UsedBindings[DescSet].resize(2 * NewBinding + 1);
+  }
+
+  UsedBindings[DescSet].set(NewBinding);
+  return NewBinding;
+}
+
+void SPIRVLegalizeImplicitBinding::replaceImplicitBindingCalls(Module &M) {
+  for (CallInst *OldCI : ImplicitBindingCalls) {
+    IRBuilder<> Builder(OldCI);
+    const uint32_t DescSet =
+        cast<ConstantInt>(OldCI->getArgOperand(1))->getZExtValue();
+    const uint32_t NewBinding = getAndReserveFirstUnusedBinding(DescSet);
+
+    SmallVector<Value *, 8> Args;
+    Args.push_back(Builder.getInt32(DescSet));
+    Args.push_back(Builder.getInt32(NewBinding));
+
+    // Copy the remaining arguments from the old call.
+    for (uint32_t i = 2; i < OldCI->arg_size(); ++i) {
+      Args.push_back(OldCI->getArgOperand(i));
+    }
+
+    Function *NewFunc = Intrinsic::getOrInsertDeclaration(
+        &M, Intrinsic::spv_resource_handlefrombinding, OldCI->getType());
+    CallInst *NewCI = Builder.CreateCall(NewFunc, Args);
+    NewCI->setCallingConv(OldCI->getCallingConv());
+
+    OldCI->replaceAllUsesWith(NewCI);
+    OldCI->eraseFromParent();
+  }
+}
+
+bool SPIRVLegalizeImplicitBinding::runOnModule(Module &M) {
+  collectBindingInfo(M);
+  if (ImplicitBindingCalls.empty()) {
+    return false;
+  }
+
+  replaceImplicitBindingCalls(M);
+  return true;
+}
+} // namespace
+
+char SPIRVLegalizeImplicitBinding::ID = 0;
+
+INITIALIZE_PASS(SPIRVLegalizeImplicitBinding, "legalize-spirv-implicit-binding",
+                "Legalize SPIR-V implicit bindings", false, false)
+
+ModulePass *llvm::createSPIRVLegalizeImplicitBindingPass() {
+  return new SPIRVLegalizeImplicitBinding();
+}
+\ No newline at end of file
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
index d7cf211..e0bfb77 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
@@ -226,6 +226,7 @@ void SPIRVPassConfig::addIRPasses() {
 }
 
 void SPIRVPassConfig::addISelPrepare() {
+  addPass(createSPIRVLegalizeImplicitBindingPass());
   addPass(createSPIRVEmitIntrinsicsPass(&getTM<SPIRVTargetMachine>()));
   if (TM.getSubtargetImpl()->isLogicalSPIRV())
     addPass(createSPIRVLegalizePointerCastPass(&getTM<SPIRVTargetMachine>()));
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 1aa8efe..c0fc3a6 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1179,7 +1179,7 @@ static void fixupVariableFloatArgs(SmallVectorImpl<CCValAssign> &ArgLocs,
     if (!VA.isRegLoc() || (ValTy != MVT::f64 && ValTy != MVT::f128))
       continue;
     // The fixed arguments to a varargs function still go in FP registers.
-    if (Outs[VA.getValNo()].IsFixed)
+    if (!Outs[VA.getValNo()].Flags.isVarArg())
       continue;
 
     // This floating point argument should be reassigned.
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
index 25f4aac..fbb98ff 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
@@ -31,10 +31,6 @@ namespace SystemZ {
 
 class SystemZCCState : public CCState {
 private:
-  /// Records whether the value was a fixed argument.
-  /// See ISD::OutputArg::IsFixed.
-  SmallVector<bool, 4> ArgIsFixed;
-
   /// Records whether the value was widened from a short vector type.
   SmallVector<bool, 4> ArgIsShortVector;
 
@@ -50,10 +46,6 @@ public:
 
   void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
                               CCAssignFn Fn) {
-    // Formal arguments are always fixed.
-    ArgIsFixed.clear();
-    for (unsigned i = 0; i < Ins.size(); ++i)
-      ArgIsFixed.push_back(true);
     // Record whether the call operand was a short vector.
     ArgIsShortVector.clear();
     for (unsigned i = 0; i < Ins.size(); ++i)
@@ -64,10 +56,6 @@ public:
 
   void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
                            CCAssignFn Fn) {
-    // Record whether the call operand was a fixed argument.
-    ArgIsFixed.clear();
-    for (unsigned i = 0; i < Outs.size(); ++i)
-      ArgIsFixed.push_back(Outs[i].IsFixed);
     // Record whether the call operand was a short vector.
     ArgIsShortVector.clear();
     for (unsigned i = 0; i < Outs.size(); ++i)
@@ -77,12 +65,11 @@ public:
   }
 
   // This version of AnalyzeCallOperands in the base class is not usable
-  // since we must provide a means of accessing ISD::OutputArg::IsFixed.
+  // since we must provide a means of accessing ISD::OutputArg::IsShortVector.
   void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs,
                            SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
                            CCAssignFn Fn) = delete;
 
-  bool IsFixed(unsigned ValNo) { return ArgIsFixed[ValNo]; }
   bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; }
 };
 
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 0ad872b..059f31f 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -16,14 +16,6 @@ class CCIfSubtarget<string F, CCAction A>
                     "getSubtarget<SystemZSubtarget>().", F),
          A>;
 
-// Match if this specific argument is a fixed (i.e. named) argument.
-class CCIfFixed<CCAction A>
-    : CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>;
-
-// Match if this specific argument is not a fixed (i.e. vararg) argument.
-class CCIfNotFixed<CCAction A>
-    : CCIf<"!(static_cast<SystemZCCState *>(&State)->IsFixed(ValNo))", A>;
-
 // Match if this specific argument was widened from a short vector type.
 class CCIfShortVector<CCAction A>
     : CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>;
@@ -79,7 +71,7 @@ def CC_SystemZ_GHC : CallingConv<[
   // Pass in STG registers: XMM1, ..., XMM6
   CCIfSubtarget<"hasVector()",
     CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-             CCIfFixed<CCAssignToReg<[V16, V17, V18, V19, V20, V21]>>>>,
+             CCIfArgFixed<CCAssignToReg<[V16, V17, V18, V19, V20, V21]>>>>,
 
   // Fail otherwise
   CCCustom<"CC_SystemZ_GHC_Error">
@@ -125,8 +117,8 @@ def CC_SystemZ_ELF : CallingConv<[
   // during type legalization.
   CCIfSubtarget<"hasVector()",
     CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-             CCIfFixed<CCAssignToReg<[V24, V26, V28, V30,
-                                      V25, V27, V29, V31]>>>>,
+             CCIfArgFixed<CCAssignToReg<[V24, V26, V28, V30,
+                                         V25, V27, V29, V31]>>>>,
 
   // However, sub-128 vectors which need to go on the stack occupy just a
   // single 8-byte-aligned 8-byte stack slot.  Pass as i64.
@@ -227,17 +219,17 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
   // Promote f32 to f64 and bitcast to i64, if it needs to be passed in GPRs.
   // Although we assign the f32 vararg to be bitcast, it will first be promoted
   // to an f64 within convertValVTToLocVT().
-  CCIfType<[f32, f64], CCIfNotFixed<CCBitConvertToType<i64>>>,
+  CCIfType<[f32, f64], CCIfArgVarArg<CCBitConvertToType<i64>>>,
   // Pointers are always passed in full 64-bit registers.
   CCIfPtr<CCCustom<"CC_XPLINK64_Pointer">>,
   // long double, can only be passed in GPR2 and GPR3, if available,
   // hence R2Q
-  CCIfType<[f128], CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>,
+  CCIfType<[f128], CCIfArgVarArg<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>,
   // Non fixed vector arguments are treated in the same way as long
   // doubles.
   CCIfSubtarget<"hasVector()",
     CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-      CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>,
+      CCIfArgVarArg<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>,
 
   // A SwiftSelf is passed in callee-saved R10.
   CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R10D]>>>,
@@ -260,22 +252,24 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
   // during type legalization.
   CCIfSubtarget<"hasVector()",
     CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-      CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>,
+      CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>,
   CCIfSubtarget<"hasVector()",
     CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-              CCIfFixed<CCAssignToRegAndStack<[V24, V25, V26, V27,
-                                               V28, V29, V30, V31], 16, 8>>>>,
+              CCIfArgFixed<CCAssignToRegAndStack<[V24, V25, V26, V27,
+                                                  V28, V29, V30, V31], 16, 8>>>>,
 
   // The first 4 named float and double arguments are passed in registers
   // FPR0-FPR6. The rest will be passed in the user area.
-  CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
-  CCIfType<[f32], CCIfFixed<CCAssignToRegAndStack<[F0S, F2S, F4S, F6S], 4, 8>>>,
-  CCIfType<[f64], CCIfFixed<CCAssignToRegAndStack<[F0D, F2D, F4D, F6D], 8, 8>>>,
+  CCIfType<[f32, f64], CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
+  CCIfType<[f32],
+           CCIfArgFixed<CCAssignToRegAndStack<[F0S, F2S, F4S, F6S], 4, 8>>>,
+  CCIfType<[f64],
+           CCIfArgFixed<CCAssignToRegAndStack<[F0D, F2D, F4D, F6D], 8, 8>>>,
 
   // The first 2 long double arguments are passed in register FPR0/FPR2
   // and FPR4/FPR6. The rest will be passed in the user area.
-  CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
-  CCIfType<[f128], CCIfFixed<CCAssignToRegAndStack<[F0Q, F4Q], 16, 8>>>,
+  CCIfType<[f128], CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
+  CCIfType<[f128], CCIfArgFixed<CCAssignToRegAndStack<[F0Q, F4Q], 16, 8>>>,
 
   // Other arguments are passed in 8-byte-aligned 8-byte stack slots.
   CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 6297916..5ee66e3 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -574,13 +574,11 @@ void SystemZELFFrameLowering::emitPrologue(MachineFunction &MF,
 
     // Call mcount (Regmask from CC AnyReg since mcount preserves all normal
     // argument registers).
-    FunctionCallee FC = MF.getFunction().getParent()->getOrInsertFunction(
-        "mcount", Type::getVoidTy(MF.getFunction().getContext()));
     const uint32_t *Mask = MF.getSubtarget<SystemZSubtarget>()
                                .getSpecialRegisters()
                                ->getCallPreservedMask(MF, CallingConv::AnyReg);
     BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::CallBRASL))
-        .addGlobalAddress(dyn_cast<Function>(FC.getCallee()))
+        .addExternalSymbol("mcount")
         .addRegMask(Mask);
 
     // Reload return address from 8 bytes above stack pointer.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 3f80b2a..f9eba4b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1309,7 +1309,7 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
       OutVal = FINode;
     }
     // Count the number of fixed args *after* legalization.
-    NumFixedArgs += Out.IsFixed;
+    NumFixedArgs += !Out.Flags.isVarArg();
   }
 
   bool IsVarArg = CLI.IsVarArg;
@@ -1503,7 +1503,7 @@ SDValue WebAssemblyTargetLowering::LowerReturn(
   for (const ISD::OutputArg &Out : Outs) {
     assert(!Out.Flags.isByVal() && "byval is not valid for return values");
     assert(!Out.Flags.isNest() && "nest is not valid for return values");
-    assert(Out.IsFixed && "non-fixed return value is not valid");
+    assert(!Out.Flags.isVarArg() && "non-fixed return value is not valid");
     if (Out.Flags.isInAlloca())
       fail(DL, DAG, "WebAssembly hasn't implemented inalloca results");
     if (Out.Flags.isInConsecutiveRegs())
diff --git a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
index c0a6035..d9f4405 100644
--- a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
+++ b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
@@ -75,7 +75,7 @@ public:
     static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2,
                                            X86::XMM3, X86::XMM4, X86::XMM5,
                                            X86::XMM6, X86::XMM7};
-    if (!Info.IsFixed)
+    if (Flags.isVarArg())
       NumXMMRegs = State.getFirstUnallocated(XMMArgRegs);
 
     return Res;
@@ -363,7 +363,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                      Info.CallConv, Info.IsVarArg))
     return false;
 
-  bool IsFixed = Info.OrigArgs.empty() ? true : Info.OrigArgs.back().IsFixed;
+  bool IsFixed =
+      Info.OrigArgs.empty() ? true : !Info.OrigArgs.back().Flags[0].isVarArg();
   if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(Info.CallConv)) {
     // From AMD64 ABI document:
     // For calls that may call functions that use varargs or stdargs
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 7e09d30..22192e1f 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -11,7 +11,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/TargetParser/Host.h"
+#include "llvm/ADT/Bitfields.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -167,35 +170,10 @@ StringRef sys::detail::getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent) {
       .Default(generic);
 }
 
-StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
-  // The cpuid register on arm is not accessible from user space. On Linux,
-  // it is exposed through the /proc/cpuinfo file.
-
-  // Read 32 lines from /proc/cpuinfo, which should contain the CPU part line
-  // in all cases.
-  SmallVector<StringRef, 32> Lines;
-  ProcCpuinfoContent.split(Lines, '\n');
-
-  // Look for the CPU implementer and hardware lines, and store the CPU part
-  // numbers found.
-  StringRef Implementer;
-  StringRef Hardware;
-  SmallVector<StringRef, 32> Parts;
-  for (StringRef Line : Lines) {
-    if (Line.consume_front("CPU implementer"))
-      Implementer = Line.ltrim("\t :");
-    else if (Line.consume_front("Hardware"))
-      Hardware = Line.ltrim("\t :");
-    else if (Line.consume_front("CPU part"))
-      Parts.emplace_back(Line.ltrim("\t :"));
-  }
-
-  // Last `Part' seen, in case we don't analyse all `Parts' parsed.
-  StringRef Part = Parts.empty() ? StringRef() : Parts.back();
-
-  // Remove duplicate `Parts'.
-  llvm::sort(Parts);
-  Parts.erase(llvm::unique(Parts), Parts.end());
+StringRef
+getHostCPUNameForARMFromComponents(StringRef Implementer, StringRef Hardware,
+                                   StringRef Part, ArrayRef<StringRef> Parts,
+                                   function_ref<unsigned()> GetVariant) {
 
   auto MatchBigLittle = [](auto const &Parts, StringRef Big, StringRef Little) {
     if (Parts.size() == 2)
@@ -343,21 +321,17 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
   if (Implementer == "0x53") { // Samsung Electronics Co., Ltd.
     // The Exynos chips have a convoluted ID scheme that doesn't seem to follow
     // any predictive pattern across variants and parts.
-    unsigned Variant = 0, Part = 0;
 
     // Look for the CPU variant line, whose value is a 1 digit hexadecimal
     // number, corresponding to the Variant bits in the CP15/C0 register.
-    for (auto I : Lines)
-      if (I.consume_front("CPU variant"))
-        I.ltrim("\t :").getAsInteger(0, Variant);
+    unsigned Variant = GetVariant();
 
-    // Look for the CPU part line, whose value is a 3 digit hexadecimal
-    // number, corresponding to the PartNum bits in the CP15/C0 register.
-    for (auto I : Lines)
-      if (I.consume_front("CPU part"))
-        I.ltrim("\t :").getAsInteger(0, Part);
+    // Convert the CPU part line, whose value is a 3 digit hexadecimal number,
+    // corresponding to the PartNum bits in the CP15/C0 register.
+    unsigned PartAsInt;
+    Part.getAsInteger(0, PartAsInt);
 
-    unsigned Exynos = (Variant << 12) | Part;
+    unsigned Exynos = (Variant << 12) | PartAsInt;
     switch (Exynos) {
     default:
       // Default by falling through to Exynos M3.
@@ -416,6 +390,78 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
   return "generic";
 }
 
+StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
+  // The cpuid register on arm is not accessible from user space. On Linux,
+  // it is exposed through the /proc/cpuinfo file.
+
+  // Read 32 lines from /proc/cpuinfo, which should contain the CPU part line
+  // in all cases.
+  SmallVector<StringRef, 32> Lines;
+  ProcCpuinfoContent.split(Lines, '\n');
+
+  // Look for the CPU implementer and hardware lines, and store the CPU part
+  // numbers found.
+  StringRef Implementer;
+  StringRef Hardware;
+  SmallVector<StringRef, 32> Parts;
+  for (StringRef Line : Lines) {
+    if (Line.consume_front("CPU implementer"))
+      Implementer = Line.ltrim("\t :");
+    else if (Line.consume_front("Hardware"))
+      Hardware = Line.ltrim("\t :");
+    else if (Line.consume_front("CPU part"))
+      Parts.emplace_back(Line.ltrim("\t :"));
+  }
+
+  // Last `Part' seen, in case we don't analyse all `Parts' parsed.
+  StringRef Part = Parts.empty() ? StringRef() : Parts.back();
+
+  // Remove duplicate `Parts'.
+  llvm::sort(Parts);
+  Parts.erase(llvm::unique(Parts), Parts.end());
+
+  auto GetVariant = [&]() {
+    unsigned Variant = 0;
+    for (auto I : Lines)
+      if (I.consume_front("CPU variant"))
+        I.ltrim("\t :").getAsInteger(0, Variant);
+    return Variant;
+  };
+
+  return getHostCPUNameForARMFromComponents(Implementer, Hardware, Part, Parts,
+                                            GetVariant);
+}
+
+StringRef sys::detail::getHostCPUNameForARM(uint64_t PrimaryCpuInfo,
+                                            ArrayRef<uint64_t> UniqueCpuInfos) {
+  // On Windows, the registry provides cached copied of the MIDR_EL1 register.
+  using PartNum = Bitfield::Element<uint16_t, 4, 12>;
+  using Implementer = Bitfield::Element<uint16_t, 24, 8>;
+  using Variant = Bitfield::Element<uint16_t, 20, 4>;
+
+  SmallVector<std::string> PartsHolder;
+  PartsHolder.reserve(UniqueCpuInfos.size());
+  for (auto Info : UniqueCpuInfos)
+    PartsHolder.push_back("0x" + utohexstr(Bitfield::get<PartNum>(Info),
+                                           /*LowerCase*/ true,
+                                           /*Width*/ 3));
+
+  SmallVector<StringRef> Parts;
+  Parts.reserve(PartsHolder.size());
+  for (const auto &Part : PartsHolder)
+    Parts.push_back(Part);
+
+  return getHostCPUNameForARMFromComponents(
+      "0x" + utohexstr(Bitfield::get<Implementer>(PrimaryCpuInfo),
+                       /*LowerCase*/ true,
+                       /*Width*/ 2),
+      /*Hardware*/ "",
+      "0x" + utohexstr(Bitfield::get<PartNum>(PrimaryCpuInfo),
+                       /*LowerCase*/ true,
+                       /*Width*/ 3),
+      Parts, [=]() { return Bitfield::get<Variant>(PrimaryCpuInfo); });
+}
+
 namespace {
 StringRef getCPUNameFromS390Model(unsigned int Id, bool HaveVectorSupport) {
   switch (Id) {
@@ -1450,6 +1496,75 @@ StringRef sys::getHostCPUName() {
   return "generic";
 }
 
+#elif defined(_M_ARM64) || defined(_M_ARM64EC)
+
+StringRef sys::getHostCPUName() {
+  constexpr char CentralProcessorKeyName[] =
+      "HARDWARE\\DESCRIPTION\\System\\CentralProcessor";
+  // Sub keys names are simple numbers ("0", "1", etc.) so 10 chars should be
+  // enough for the slash and name.
+  constexpr size_t SubKeyNameMaxSize = ARRAYSIZE(CentralProcessorKeyName) + 10;
+
+  SmallVector<uint64_t> Values;
+  uint64_t PrimaryCpuInfo;
+  char PrimaryPartKeyName[SubKeyNameMaxSize];
+  DWORD PrimaryPartKeyNameSize = 0;
+  HKEY CentralProcessorKey;
+  if (RegOpenKeyExA(HKEY_LOCAL_MACHINE, CentralProcessorKeyName, 0, KEY_READ,
+                    &CentralProcessorKey) == ERROR_SUCCESS) {
+    for (unsigned Index = 0; Index < UINT32_MAX; ++Index) {
+      char SubKeyName[SubKeyNameMaxSize];
+      DWORD SubKeySize = SubKeyNameMaxSize;
+      HKEY SubKey;
+      if ((RegEnumKeyExA(CentralProcessorKey, Index, SubKeyName, &SubKeySize,
+                         nullptr, nullptr, nullptr,
+                         nullptr) == ERROR_SUCCESS) &&
+          (RegOpenKeyExA(CentralProcessorKey, SubKeyName, 0, KEY_READ,
+                         &SubKey) == ERROR_SUCCESS)) {
+        // The "CP 4000" registry key contains a cached copy of the MIDR_EL1
+        // register.
+        uint64_t RegValue;
+        DWORD ActualType;
+        DWORD RegValueSize = sizeof(RegValue);
+        if ((RegQueryValueExA(SubKey, "CP 4000", nullptr, &ActualType,
+                              (PBYTE)&RegValue,
+                              &RegValueSize) == ERROR_SUCCESS) &&
+            (ActualType == REG_QWORD) && RegValueSize == sizeof(RegValue)) {
+          // Assume that the part with the "highest" reg key name is the primary
+          // part (to match the way that Linux's cpuinfo is written). Win32
+          // makes no guarantees about the order of sub keys, so we have to
+          // compare the names.
+          if (PrimaryPartKeyNameSize < SubKeySize ||
+              (PrimaryPartKeyNameSize == SubKeySize &&
+               ::memcmp(SubKeyName, PrimaryPartKeyName, SubKeySize) > 0)) {
+            PrimaryCpuInfo = RegValue;
+            ::memcpy(PrimaryPartKeyName, SubKeyName, SubKeySize + 1);
+            PrimaryPartKeyNameSize = SubKeySize;
+          }
+          if (!llvm::is_contained(Values, RegValue)) {
+            Values.push_back(RegValue);
+          }
+        }
+        RegCloseKey(SubKey);
+      } else {
+        // No more sub keys.
+        break;
+      }
+    }
+    RegCloseKey(CentralProcessorKey);
+  }
+
+  if (Values.empty()) {
+    return "generic";
+  }
+
+  // Win32 makes no guarantees about the order of sub keys, so sort to ensure
+  // reproducibility.
+  llvm::sort(Values);
+
+  return detail::getHostCPUNameForARM(PrimaryCpuInfo, Values);
+}
+
 #elif defined(__APPLE__) && defined(__powerpc__)
 StringRef sys::getHostCPUName() {
   host_basic_info_data_t hostInfo;
diff --git a/llvm/lib/TextAPI/Architecture.cpp b/llvm/lib/TextAPI/Architecture.cpp
index 51ca91d..3b53067 100644
--- a/llvm/lib/TextAPI/Architecture.cpp
+++ b/llvm/lib/TextAPI/Architecture.cpp
@@ -21,7 +21,7 @@ namespace llvm {
 namespace MachO {
 
 Architecture getArchitectureFromCpuType(uint32_t CPUType, uint32_t CPUSubType) {
-#define ARCHINFO(Arch, Type, Subtype, NumBits)                                 \
+#define ARCHINFO(Arch, Name, Type, Subtype, NumBits)                           \
   if (CPUType == (Type) &&                                                     \
       (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) == (Subtype))                    \
     return AK_##Arch;
@@ -33,7 +33,7 @@ Architecture getArchitectureFromCpuType(uint32_t CPUType, uint32_t CPUSubType) {
 
 Architecture getArchitectureFromName(StringRef Name) {
   return StringSwitch<Architecture>(Name)
-#define ARCHINFO(Arch, Type, Subtype, NumBits) .Case(#Arch, AK_##Arch)
+#define ARCHINFO(Arch, Name, Type, Subtype, NumBits) .Case(#Name, AK_##Arch)
 #include "llvm/TextAPI/Architecture.def"
 #undef ARCHINFO
       .Default(AK_unknown);
@@ -41,9 +41,9 @@ Architecture getArchitectureFromName(StringRef Name) {
 
 StringRef getArchitectureName(Architecture Arch) {
   switch (Arch) {
-#define ARCHINFO(Arch, Type, Subtype, NumBits)                                 \
+#define ARCHINFO(Arch, Name, Type, Subtype, NumBits)                           \
   case AK_##Arch:                                                              \
-    return #Arch;
+    return #Name;
 #include "llvm/TextAPI/Architecture.def"
 #undef ARCHINFO
   case AK_unknown:
@@ -57,7 +57,7 @@ StringRef getArchitectureName(Architecture Arch) {
 
 std::pair<uint32_t, uint32_t> getCPUTypeFromArchitecture(Architecture Arch) {
   switch (Arch) {
-#define ARCHINFO(Arch, Type, Subtype, NumBits)                                 \
+#define ARCHINFO(Arch, Name, Type, Subtype, NumBits)                           \
   case AK_##Arch:                                                              \
     return std::make_pair(Type, Subtype);
 #include "llvm/TextAPI/Architecture.def"
@@ -77,7 +77,7 @@ Architecture mapToArchitecture(const Triple &Target) {
 
 bool is64Bit(Architecture Arch) {
   switch (Arch) {
-#define ARCHINFO(Arch, Type, Subtype, NumBits)                                 \
+#define ARCHINFO(Arch, Name, Type, Subtype, NumBits)                           \
   case AK_##Arch:                                                              \
     return NumBits == 64;
 #include "llvm/TextAPI/Architecture.def"
diff --git a/llvm/lib/TextAPI/TextStubCommon.cpp b/llvm/lib/TextAPI/TextStubCommon.cpp
index 0b710b0..7bf1f9a 100644
--- a/llvm/lib/TextAPI/TextStubCommon.cpp
+++ b/llvm/lib/TextAPI/TextStubCommon.cpp
@@ -133,7 +133,7 @@ QuotingType ScalarTraits<PlatformSet>::mustQuote(StringRef) {
 
 void ScalarBitSetTraits<ArchitectureSet>::bitset(IO &IO,
                                                  ArchitectureSet &Archs) {
-#define ARCHINFO(arch, type, subtype, numbits)                                 \
+#define ARCHINFO(arch, name, type, subtype, numbits)                           \
   IO.bitSetCase(Archs, #arch, 1U << static_cast<int>(AK_##arch));
 #include "llvm/TextAPI/Architecture.def"
 #undef ARCHINFO
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index ed3dca2..59a47a9 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -2361,15 +2361,13 @@ remapIndices(Function &Caller, BasicBlock *StartBB,
 // Updating the contextual profile after an inlining means, at a high level,
 // copying over the data of the callee, **intentionally without any value
 // scaling**, and copying over the callees of the inlined callee.
-llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
-                                        PGOContextualProfile &CtxProf,
-                                        bool MergeAttributes,
-                                        AAResults *CalleeAAR,
-                                        bool InsertLifetime,
-                                        Function *ForwardVarArgsTo) {
+llvm::InlineResult llvm::InlineFunction(
+    CallBase &CB, InlineFunctionInfo &IFI, PGOContextualProfile &CtxProf,
+    bool MergeAttributes, AAResults *CalleeAAR, bool InsertLifetime,
+    Function *ForwardVarArgsTo, OptimizationRemarkEmitter *ORE) {
   if (!CtxProf.isInSpecializedModule())
     return InlineFunction(CB, IFI, MergeAttributes, CalleeAAR, InsertLifetime,
-                          ForwardVarArgsTo);
+                          ForwardVarArgsTo, ORE);
 
   auto &Caller = *CB.getCaller();
   auto &Callee = *CB.getCalledFunction();
@@ -2387,7 +2385,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
   const auto NumCalleeCallsites = CtxProf.getNumCallsites(Callee);
 
   auto Ret = InlineFunction(CB, IFI, MergeAttributes, CalleeAAR, InsertLifetime,
-                            ForwardVarArgsTo);
+                            ForwardVarArgsTo, ORE);
   if (!Ret.isSuccess())
     return Ret;
 
@@ -2457,20 +2455,8 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
   return Ret;
 }
 
-/// This function inlines the called function into the basic block of the
-/// caller. This returns false if it is not possible to inline this call.
-/// The program is still in a well defined state if this occurs though.
-///
-/// Note that this only does one level of inlining.  For example, if the
-/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
-/// exists in the instruction stream.  Similarly this will inline a recursive
-/// function by one level.
-llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
-                                        bool MergeAttributes,
-                                        AAResults *CalleeAAR,
-                                        bool InsertLifetime,
-                                        Function *ForwardVarArgsTo,
-                                        OptimizationRemarkEmitter *ORE) {
+llvm::InlineResult llvm::CanInlineCallSite(const CallBase &CB,
+                                           InlineFunctionInfo &IFI) {
   assert(CB.getParent() && CB.getFunction() && "Instruction not in function!");
 
   // FIXME: we don't inline callbr yet.
@@ -2487,7 +2473,6 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
 
   // The inliner does not know how to inline through calls with operand bundles
   // in general ...
-  Value *ConvergenceControlToken = nullptr;
   if (CB.hasOperandBundles()) {
     for (int i = 0, e = CB.getNumOperandBundles(); i != e; ++i) {
       auto OBUse = CB.getOperandBundleAt(i);
@@ -2503,7 +2488,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
       if (Tag == LLVMContext::OB_kcfi)
         continue;
       if (Tag == LLVMContext::OB_convergencectrl) {
-        ConvergenceControlToken = OBUse.Inputs[0].get();
+        IFI.ConvergenceControlToken = OBUse.Inputs[0].get();
         continue;
       }
 
@@ -2521,28 +2506,22 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
   // fully implements convergence control tokens, there is no mixing of
   // controlled and uncontrolled convergent operations in the whole program.
   if (CB.isConvergent()) {
-    if (!ConvergenceControlToken &&
+    if (!IFI.ConvergenceControlToken &&
         getConvergenceEntry(CalledFunc->getEntryBlock())) {
       return InlineResult::failure(
           "convergent call needs convergencectrl operand");
     }
   }
 
-  // If the call to the callee cannot throw, set the 'nounwind' flag on any
-  // calls that we inline.
-  bool MarkNoUnwind = CB.doesNotThrow();
-
-  BasicBlock *OrigBB = CB.getParent();
-  Function *Caller = OrigBB->getParent();
+  const BasicBlock *OrigBB = CB.getParent();
+  const Function *Caller = OrigBB->getParent();
 
   // GC poses two hazards to inlining, which only occur when the callee has GC:
   //  1. If the caller has no GC, then the callee's GC must be propagated to the
   //     caller.
   //  2. If the caller has a differing GC, it is invalid to inline.
   if (CalledFunc->hasGC()) {
-    if (!Caller->hasGC())
-      Caller->setGC(CalledFunc->getGC());
-    else if (CalledFunc->getGC() != Caller->getGC())
+    if (Caller->hasGC() && CalledFunc->getGC() != Caller->getGC())
       return InlineResult::failure("incompatible GC");
   }
 
@@ -2560,34 +2539,31 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
           ? Caller->getPersonalityFn()->stripPointerCasts()
           : nullptr;
   if (CalledPersonality) {
-    if (!CallerPersonality)
-      Caller->setPersonalityFn(CalledPersonality);
     // If the personality functions match, then we can perform the
     // inlining. Otherwise, we can't inline.
     // TODO: This isn't 100% true. Some personality functions are proper
     //       supersets of others and can be used in place of the other.
-    else if (CalledPersonality != CallerPersonality)
+    if (CallerPersonality && CalledPersonality != CallerPersonality)
       return InlineResult::failure("incompatible personality");
   }
 
   // We need to figure out which funclet the callsite was in so that we may
   // properly nest the callee.
-  Instruction *CallSiteEHPad = nullptr;
   if (CallerPersonality) {
     EHPersonality Personality = classifyEHPersonality(CallerPersonality);
     if (isScopedEHPersonality(Personality)) {
       std::optional<OperandBundleUse> ParentFunclet =
           CB.getOperandBundle(LLVMContext::OB_funclet);
       if (ParentFunclet)
-        CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front());
+        IFI.CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front());
 
       // OK, the inlining site is legal.  What about the target function?
 
-      if (CallSiteEHPad) {
+      if (IFI.CallSiteEHPad) {
         if (Personality == EHPersonality::MSVC_CXX) {
           // The MSVC personality cannot tolerate catches getting inlined into
           // cleanup funclets.
-          if (isa<CleanupPadInst>(CallSiteEHPad)) {
+          if (isa<CleanupPadInst>(IFI.CallSiteEHPad)) {
             // Ok, the call site is within a cleanuppad.  Let's check the callee
             // for catchpads.
             for (const BasicBlock &CalledBB : *CalledFunc) {
@@ -2607,13 +2583,34 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
     }
   }
 
+  return InlineResult::success();
+}
+
+/// This function inlines the called function into the basic block of the
+/// caller. This returns false if it is not possible to inline this call.
+/// The program is still in a well defined state if this occurs though.
+///
+/// Note that this only does one level of inlining.  For example, if the
+/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
+/// exists in the instruction stream.  Similarly this will inline a recursive
+/// function by one level.
+void llvm::InlineFunctionImpl(CallBase &CB, InlineFunctionInfo &IFI,
+                              bool MergeAttributes, AAResults *CalleeAAR,
+                              bool InsertLifetime, Function *ForwardVarArgsTo,
+                              OptimizationRemarkEmitter *ORE) {
+  BasicBlock *OrigBB = CB.getParent();
+  Function *Caller = OrigBB->getParent();
+  Function *CalledFunc = CB.getCalledFunction();
+  assert(CalledFunc && !CalledFunc->isDeclaration() &&
+         "CanInlineCallSite should have verified direct call to definition");
+
   // Determine if we are dealing with a call in an EHPad which does not unwind
   // to caller.
   bool EHPadForCallUnwindsLocally = false;
-  if (CallSiteEHPad && isa<CallInst>(CB)) {
+  if (IFI.CallSiteEHPad && isa<CallInst>(CB)) {
     UnwindDestMemoTy FuncletUnwindMap;
     Value *CallSiteUnwindDestToken =
-        getUnwindDestToken(CallSiteEHPad, FuncletUnwindMap);
+        getUnwindDestToken(IFI.CallSiteEHPad, FuncletUnwindMap);
 
     EHPadForCallUnwindsLocally =
         CallSiteUnwindDestToken &&
@@ -2630,6 +2627,30 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
   ClonedCodeInfo InlinedFunctionInfo;
   Function::iterator FirstNewBlock;
 
+  // GC poses two hazards to inlining, which only occur when the callee has GC:
+  //  1. If the caller has no GC, then the callee's GC must be propagated to the
+  //     caller.
+  //  2. If the caller has a differing GC, it is invalid to inline.
+  if (CalledFunc->hasGC()) {
+    if (!Caller->hasGC())
+      Caller->setGC(CalledFunc->getGC());
+    else {
+      assert(CalledFunc->getGC() == Caller->getGC() &&
+             "CanInlineCallSite should have verified compatible GCs");
+    }
+  }
+
+  if (CalledFunc->hasPersonalityFn()) {
+    Constant *CalledPersonality =
+        CalledFunc->getPersonalityFn()->stripPointerCasts();
+    if (!Caller->hasPersonalityFn()) {
+      Caller->setPersonalityFn(CalledPersonality);
+    } else
+      assert(Caller->getPersonalityFn()->stripPointerCasts() ==
+                 CalledPersonality &&
+             "CanInlineCallSite should have verified compatible personality");
+  }
+
   { // Scope to destroy VMap after cloning.
     ValueToValueMapTy VMap;
     struct ByValInit {
@@ -2819,10 +2840,10 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
             IFI.GetAssumptionCache(*Caller).registerAssumption(II);
   }
 
-  if (ConvergenceControlToken) {
+  if (IFI.ConvergenceControlToken) {
     IntrinsicInst *IntrinsicCall = getConvergenceEntry(*FirstNewBlock);
     if (IntrinsicCall) {
-      IntrinsicCall->replaceAllUsesWith(ConvergenceControlToken);
+      IntrinsicCall->replaceAllUsesWith(IFI.ConvergenceControlToken);
       IntrinsicCall->eraseFromParent();
     }
   }
@@ -2869,6 +2890,10 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
     }
   }
 
+  // If the call to the callee cannot throw, set the 'nounwind' flag on any
+  // calls that we inline.
+  bool MarkNoUnwind = CB.doesNotThrow();
+
   SmallVector<Value*,4> VarArgsToForward;
   SmallVector<AttributeSet, 4> VarArgsAttrs;
   for (unsigned i = CalledFunc->getFunctionType()->getNumParams();
@@ -3055,12 +3080,12 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
   // Update the lexical scopes of the new funclets and callsites.
   // Anything that had 'none' as its parent is now nested inside the callsite's
   // EHPad.
-  if (CallSiteEHPad) {
+  if (IFI.CallSiteEHPad) {
     for (Function::iterator BB = FirstNewBlock->getIterator(),
                             E = Caller->end();
          BB != E; ++BB) {
       // Add bundle operands to inlined call sites.
-      PropagateOperandBundles(BB, CallSiteEHPad);
+      PropagateOperandBundles(BB, IFI.CallSiteEHPad);
 
       // It is problematic if the inlinee has a cleanupret which unwinds to
       // caller and we inline it into a call site which doesn't unwind but into
@@ -3076,11 +3101,11 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
 
       if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) {
         if (isa<ConstantTokenNone>(CatchSwitch->getParentPad()))
-          CatchSwitch->setParentPad(CallSiteEHPad);
+          CatchSwitch->setParentPad(IFI.CallSiteEHPad);
       } else {
         auto *FPI = cast<FuncletPadInst>(I);
         if (isa<ConstantTokenNone>(FPI->getParentPad()))
-          FPI->setParentPad(CallSiteEHPad);
+          FPI->setParentPad(IFI.CallSiteEHPad);
       }
     }
   }
@@ -3236,7 +3261,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
       AttributeFuncs::mergeAttributesForInlining(*Caller, *CalledFunc);
 
     // We are now done with the inlining.
-    return InlineResult::success();
+    return;
   }
 
   // Otherwise, we have the normal case, of more than one block to inline or
@@ -3404,6 +3429,19 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
 
   if (MergeAttributes)
     AttributeFuncs::mergeAttributesForInlining(*Caller, *CalledFunc);
+}
 
-  return InlineResult::success();
+llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
+                                        bool MergeAttributes,
+                                        AAResults *CalleeAAR,
+                                        bool InsertLifetime,
+                                        Function *ForwardVarArgsTo,
+                                        OptimizationRemarkEmitter *ORE) {
+  llvm::InlineResult Result = CanInlineCallSite(CB, IFI);
+  if (Result.isSuccess()) {
+    InlineFunctionImpl(CB, IFI, MergeAttributes, CalleeAAR, InsertLifetime,
+                       ForwardVarArgsTo, ORE);
+  }
+
+  return Result;
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9667b50..be00fd6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7530,9 +7530,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
   EPI.MainLoopIterationCountCheck =
       emitIterationCountCheck(LoopScalarPreHeader, false);
 
-  // Generate the induction variable.
-  EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
-
   replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
   return LoopVectorPreHeader;
 }
@@ -8451,11 +8448,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                  *Plan, CM.getMinimalBitwidths());
       VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan);
       // TODO: try to put it close to addActiveLaneMask().
-      // Discard the plan if it is not EVL-compatible
-      if (CM.foldTailWithEVL() && !HasScalarVF &&
-          !VPlanTransforms::runPass(VPlanTransforms::tryAddExplicitVectorLength,
-                                    *Plan, CM.getMaxSafeElements()))
-        break;
+      if (CM.foldTailWithEVL() && !HasScalarVF)
+        VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
+                                 *Plan, CM.getMaxSafeElements());
       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
       VPlans.push_back(std::move(Plan));
     }
@@ -9810,7 +9805,7 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
 static void
 preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
                                  const SCEV2ValueTy &ExpandedSCEVs,
-                                 const EpilogueLoopVectorizationInfo &EPI) {
+                                 EpilogueLoopVectorizationInfo &EPI) {
   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
   VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
   Header->setName("vec.epilog.vector.body");
@@ -9828,30 +9823,13 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
       // loop.
       using namespace llvm::PatternMatch;
       PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
-      assert(EPResumeVal->getType() == IV->getScalarType() &&
-             match(EPResumeVal->getIncomingValueForBlock(
-                       EPI.MainLoopIterationCountCheck),
-                   m_SpecificInt(0)) &&
-             EPResumeVal ==
-                 find_singleton<PHINode>(
-                     L->getLoopPreheader()->phis(),
-                     [&EPI, IV](PHINode &P, bool) -> PHINode * {
-                       if (P.getType() == IV->getScalarType() &&
-                           match(P.getIncomingValueForBlock(
-                                     EPI.MainLoopIterationCountCheck),
-                                 m_SpecificInt(0)) &&
-                           any_of(P.incoming_values(),
-                                  [&EPI](Value *Inc) {
-                                    return Inc == EPI.VectorTripCount;
-                                  }) &&
-                           all_of(P.incoming_values(), [&EPI](Value *Inc) {
-                             return Inc == EPI.VectorTripCount ||
-                                    match(Inc, m_SpecificInt(0));
-                           }))
-                         return &P;
-                       return nullptr;
-                     }) &&
-             "Epilogue resume phis do not match!");
+      for (Value *Inc : EPResumeVal->incoming_values()) {
+        if (match(Inc, m_SpecificInt(0)))
+          continue;
+        assert(!EPI.VectorTripCount &&
+               "Must only have a single non-zero incoming value");
+        EPI.VectorTripCount = Inc;
+      }
       VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
       assert(all_of(IV->users(),
                     [](const VPUser *U) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 62ab3f52..5d0e2f9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -15097,7 +15097,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
   for (ExternalUser &EU : ExternalUses) {
     LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                       << EU.E.Idx << " in lane " << EU.Lane << "\n");
-    LLVM_DEBUG(dbgs() << "  User:" << *EU.User << "\n");
+    LLVM_DEBUG(if (EU.User) dbgs() << "  User:" << *EU.User << "\n";
+               else dbgs() << "  User: nullptr\n");
     LLVM_DEBUG(dbgs() << "  Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
 
     // Uses by ephemeral values are free (because the ephemeral value will be
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 8052e31..73babcc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1054,12 +1054,17 @@ void VPlan::execute(VPTransformState *State) {
 
 InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
   // For now only return the cost of the vector loop region, ignoring any other
-  // blocks, like the preheader or middle blocks.
+  // blocks, like the preheader or middle blocks, expect for checking them for
+  // recipes with invalid costs.
   InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx);
 
-  // If any instructions in the middle block are invalid return invalid.
-  // TODO: Remove once no VPlans with VF == vscale x 1 and first-order recurrences are created.
-  if (!getMiddleBlock()->cost(VF, Ctx).isValid())
+  // If the cost of the loop region is invalid or any recipe in the skeleton
+  // outside loop regions are invalid return an invalid cost.
+  if (!Cost.isValid() || any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(
+                                    vp_depth_first_shallow(getEntry())),
+                                [&VF, &Ctx](VPBasicBlock *VPBB) {
+                                  return !VPBB->cost(VF, Ctx).isValid();
+                                }))
     return InstructionCost::getInvalid();
 
   return Cost;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index d133610..8818843 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -461,6 +461,66 @@ m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) {
   return m_BinaryOr<Op0_t, Op1_t, /*Commutative*/ true>(Op0, Op1);
 }
 
+/// ICmp_match is a variant of BinaryRecipe_match that also binds the comparison
+/// predicate.
+template <typename Op0_t, typename Op1_t> struct ICmp_match {
+  CmpPredicate *Predicate = nullptr;
+  Op0_t Op0;
+  Op1_t Op1;
+
+  ICmp_match(CmpPredicate &Pred, const Op0_t &Op0, const Op1_t &Op1)
+      : Predicate(&Pred), Op0(Op0), Op1(Op1) {}
+  ICmp_match(const Op0_t &Op0, const Op1_t &Op1) : Op0(Op0), Op1(Op1) {}
+
+  bool match(const VPValue *V) const {
+    auto *DefR = V->getDefiningRecipe();
+    return DefR && match(DefR);
+  }
+
+  bool match(const VPRecipeBase *V) const {
+    if (m_Binary<Instruction::ICmp>(Op0, Op1).match(V)) {
+      if (Predicate)
+        *Predicate = cast<VPRecipeWithIRFlags>(V)->getPredicate();
+      return true;
+    }
+    return false;
+  }
+};
+
+/// SpecificICmp_match is a variant of ICmp_match that matches the comparison
+/// predicate, instead of binding it.
+template <typename Op0_t, typename Op1_t> struct SpecificICmp_match {
+  const CmpPredicate Predicate;
+  Op0_t Op0;
+  Op1_t Op1;
+
+  SpecificICmp_match(CmpPredicate Pred, const Op0_t &LHS, const Op1_t &RHS)
+      : Predicate(Pred), Op0(LHS), Op1(RHS) {}
+
+  bool match(const VPValue *V) const {
+    CmpPredicate CurrentPred;
+    return ICmp_match<Op0_t, Op1_t>(CurrentPred, Op0, Op1).match(V) &&
+           CmpPredicate::getMatching(CurrentPred, Predicate);
+  }
+};
+
+template <typename Op0_t, typename Op1_t>
+inline ICmp_match<Op0_t, Op1_t> m_ICmp(const Op0_t &Op0, const Op1_t &Op1) {
+  return ICmp_match<Op0_t, Op1_t>(Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline ICmp_match<Op0_t, Op1_t> m_ICmp(CmpPredicate &Pred, const Op0_t &Op0,
+                                       const Op1_t &Op1) {
+  return ICmp_match<Op0_t, Op1_t>(Pred, Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline SpecificICmp_match<Op0_t, Op1_t>
+m_SpecificICmp(CmpPredicate MatchPred, const Op0_t &Op0, const Op1_t &Op1) {
+  return SpecificICmp_match<Op0_t, Op1_t>(MatchPred, Op0, Op1);
+}
+
 template <typename Op0_t, typename Op1_t>
 using GEPLikeRecipe_match =
     BinaryRecipe_match<Op0_t, Op1_t, Instruction::GetElementPtr, false,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 1a71a75..1c8bd6c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1164,6 +1164,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
+  if (auto *Phi = dyn_cast<VPPhi>(Def)) {
+    if (Phi->getNumOperands() == 1)
+      Phi->replaceAllUsesWith(Phi->getOperand(0));
+    return;
+  }
+
   // Some simplifications can only be applied after unrolling. Perform them
   // below.
   if (!Plan->isUnrolled())
@@ -1385,11 +1391,10 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
 
     // Currently only handle cases where the single user is a header-mask
     // comparison with the backedge-taken-count.
-    if (!match(
-            *WideIV->user_begin(),
-            m_Binary<Instruction::ICmp>(
-                m_Specific(WideIV),
-                m_Broadcast(m_Specific(Plan.getOrCreateBackedgeTakenCount())))))
+    if (!match(*WideIV->user_begin(),
+               m_ICmp(m_Specific(WideIV),
+                      m_Broadcast(
+                          m_Specific(Plan.getOrCreateBackedgeTakenCount())))))
       continue;
 
     // Update IV operands and comparison bound to use new narrower type.
@@ -1422,11 +1427,9 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
     });
 
   auto *CanIV = Plan.getCanonicalIV();
-  if (!match(Cond, m_Binary<Instruction::ICmp>(
-                       m_Specific(CanIV->getBackedgeValue()),
-                       m_Specific(&Plan.getVectorTripCount()))) ||
-      cast<VPRecipeWithIRFlags>(Cond->getDefiningRecipe())->getPredicate() !=
-          CmpInst::ICMP_EQ)
+  if (!match(Cond, m_SpecificICmp(CmpInst::ICMP_EQ,
+                                  m_Specific(CanIV->getBackedgeValue()),
+                                  m_Specific(&Plan.getVectorTripCount()))))
     return false;
 
   // The compare checks CanIV + VFxUF == vector trip count. The vector trip
@@ -1835,7 +1838,7 @@ void VPlanTransforms::truncateToMinimalBitwidths(
         VPW->dropPoisonGeneratingFlags();
 
       if (OldResSizeInBits != NewResSizeInBits &&
-          !match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue()))) {
+          !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
         // Extend result to original width.
         auto *Ext =
             new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
@@ -1844,9 +1847,8 @@ void VPlanTransforms::truncateToMinimalBitwidths(
         Ext->setOperand(0, ResultVPV);
         assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
       } else {
-        assert(
-            match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) &&
-            "Only ICmps should not need extending the result.");
+        assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
+               "Only ICmps should not need extending the result.");
       }
 
       assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
@@ -2183,6 +2185,21 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
          "User of VF that we can't transform to EVL.");
   Plan.getVF().replaceAllUsesWith(&EVL);
 
+  assert(all_of(Plan.getVFxUF().users(),
+                [&Plan](VPUser *U) {
+                  return match(U, m_c_Binary<Instruction::Add>(
+                                      m_Specific(Plan.getCanonicalIV()),
+                                      m_Specific(&Plan.getVFxUF()))) ||
+                         isa<VPWidenPointerInductionRecipe>(U);
+                }) &&
+         "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
+         "increment of the canonical induction.");
+  Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
+    // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
+    // canonical induction must not be updated.
+    return isa<VPWidenPointerInductionRecipe>(U);
+  });
+
   // Defer erasing recipes till the end so that we don't invalidate the
   // VPTypeAnalysis cache.
   SmallVector<VPRecipeBase *> ToErase;
@@ -2318,16 +2335,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
 /// %NextAVL = sub IVSize nuw %AVL, %OpEVL
 /// ...
 ///
-bool VPlanTransforms::tryAddExplicitVectorLength(
+void VPlanTransforms::addExplicitVectorLength(
     VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
   VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
-  // The transform updates all users of inductions to work based on EVL, instead
-  // of the VF directly. At the moment, widened pointer inductions cannot be
-  // updated, so bail out if the plan contains any.
-  bool ContainsWidenPointerInductions =
-      any_of(Header->phis(), IsaPred<VPWidenPointerInductionRecipe>);
-  if (ContainsWidenPointerInductions)
-    return false;
 
   auto *CanonicalIVPHI = Plan.getCanonicalIV();
   auto *CanIVTy = CanonicalIVPHI->getScalarType();
@@ -2382,7 +2392,6 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
   CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
   // TODO: support unroll factor > 1.
   Plan.setUF(1);
-  return true;
 }
 
 void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
@@ -2806,13 +2815,12 @@ static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
   R->replaceAllUsesWith(PtrAdd);
 
   // Create the backedge value for the scalar pointer phi.
-  Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
+  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
+  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
   VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
                                        DL);
   VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
 
-  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
-  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
   VPValue *InductionGEP =
       Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
   ScalarPtrPhi->addOperand(InductionGEP);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index ecaca72..cc50c75 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -177,10 +177,9 @@ struct VPlanTransforms {
   /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe.
   /// VPCanonicalIVPHIRecipe is only used to control the loop after
   /// this transformation.
-  /// \returns true if the transformation succeeds, or false if it doesn't.
-  static bool
-  tryAddExplicitVectorLength(VPlan &Plan,
-                             const std::optional<unsigned> &MaxEVLSafeElements);
+  static void
+  addExplicitVectorLength(VPlan &Plan,
+                          const std::optional<unsigned> &MaxEVLSafeElements);
 
   // For each Interleave Group in \p InterleaveGroups replace the Recipes
   // widening its memory instructions with a single VPInterleaveRecipe at its
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 14ae4f2..3417e1c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -157,7 +157,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
           return VerifyEVLUse(*S, S->getNumOperands() - 1);
         })
         .Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe,
-              VPWidenIntOrFpInductionRecipe>(
+              VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>(
             [&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); })
         .Case<VPScalarIVStepsRecipe>([&](auto *R) {
           if (R->getNumOperands() != 3) {